76828ee7673571a0d4fe3491bb6fa00e4be907d8
[openssl.git] / crypto / aes / asm / aes-sparcv9.pl
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. Rights for redistribution and usage in source and binary
6 # forms are granted according to the OpenSSL license.
7 # ====================================================================
8 #
9 # Version 1.1
10 #
11 # The major reason for undertaking this effort was to mitigate the hazard of
12 # cache-timing attack. This is [currently and initially!] addressed in
13 # two ways. 1. S-boxes are compressed from 5KB to 2KB+256B size each.
14 # 2. References to them are scheduled for L2 cache latency, meaning
15 # that the tables don't have to reside in L1 cache. Once again, this
16 # is an initial draft and one should expect more countermeasures to
17 # be implemented...
18 #
19 # Version 1.1 prefetches T[ed]4 in order to mitigate attack on last
20 # round.
21 #
22 # Even though performance was not the primary goal [on the contrary,
23 # extra shifts "induced" by compressed S-box and longer loop epilogue
24 # "induced" by scheduling for L2 have negative effect on performance],
25 # the code turned out to run in ~23 cycles per processed byte en-/
26 # decrypted with a 128-bit key. This is a pretty good result for code
27 # with the mentioned qualities on an UltraSPARC core. Compared to Sun C
28 # generated code my encrypt procedure runs just a few percent faster,
29 # while the decrypt one is a whole 50% faster [yes, Sun C failed to generate
30 # optimal decrypt procedure]. Compared to GNU C generated code both
31 # procedures are more than 60% faster:-)
32
# ---- target ABI -------------------------------------------------------
# 32-bit code is produced unless one of the 64-bit compiler flags
# (-m64 or -xarch=v9) is present among the command-line arguments.
$bits = (grep { /\-m64/ || /\-xarch\=v9/ } @ARGV) ? 64 : 32;

# $bias is added to %sp in every stack reference and $frame is the size
# handed to "save"; both depend on the selected ABI.
($bias,$frame) = ($bits==64) ? (2047,192) : (0,112);

# Extra stack bytes reserved on top of $frame by the internal routines;
# the off-loaded return address is stored at offset $frame.
$locals=16;

# ---- register allocation ----------------------------------------------
# Sixteen accumulators: each one first holds a table index and then the
# value loaded from the table at that index.
($acc0, $acc1, $acc2, $acc3 ) = ("%l0","%o0","%o1","%o2");
($acc4, $acc5, $acc6, $acc7 ) = ("%l1","%o3","%o4","%o5");
($acc8, $acc9, $acc10,$acc11) = ("%l2","%o7","%g1","%g2");
($acc12,$acc13,$acc14,$acc15) = ("%l3","%g3","%g4","%g5");

# Two banks of four 32-bit words: $t0-$t3 are loaded from the key
# schedule, $s0-$s3 carry the block in and out of the internal routines.
($t0,$t1,$t2,$t3) = ("%l4","%l5","%l6","%l7");
($s0,$s1,$s2,$s3) = ("%i0","%i1","%i2","%i3");

$tbl="%i4";     # base address of the lookup table
$key="%i5";     # pointer into the key schedule
$rounds="%i7";  # aliases with return address, which is off-loaded to stack
71
# _data_word(LIST)
#
# Appends one ".long" directive per argument to the global $code buffer.
# Every value is formatted as an 8-digit hex word and written twice on
# its line, so each table entry occupies eight bytes in the assembled
# output.  Processing stops at the first undefined argument or when the
# list is exhausted.  Nothing useful is returned.
#
# The sub is deliberately declared without a prototype: the former empty
# "()" prototype claimed that no arguments are accepted, which worked only
# because callers use the prototype-bypassing "&_data_word(...)" form.
sub _data_word
{
    for my $word (@_) {
        last unless defined $word;
        $code .= sprintf("\t.long\t0x%08x,0x%08x\n", $word, $word);
    }
    return;
}
76
# 64-bit builds only: emit ".register ...,#scratch" directives for %g2
# and %g3, which this file uses as accumulators ($acc11 and $acc13).
77 $code.=<<___ if ($bits==64);
78 .register       %g2,#scratch
79 .register       %g3,#scratch
80 ___
# Open the text section and place the 256-byte aligned AES_Te label; the
# table contents are appended by the _data_word call that follows.
81 $code.=<<___;
82 .section        ".text",#alloc,#execinstr
83
84 .align  256
85 AES_Te:
86 ___
# Encryption lookup table: 256 32-bit words.  _data_word writes every
# word twice, so the table takes 8 bytes per entry (2KB in total) in the
# generated assembly.
87 &_data_word(
88         0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
89         0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
90         0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
91         0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
92         0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
93         0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
94         0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
95         0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
96         0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
97         0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
98         0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
99         0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
100         0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
101         0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
102         0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
103         0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
104         0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
105         0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
106         0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
107         0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
108         0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
109         0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
110         0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
111         0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
112         0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
113         0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
114         0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
115         0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
116         0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
117         0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
118         0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
119         0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
120         0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
121         0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
122         0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
123         0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
124         0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
125         0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
126         0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
127         0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
128         0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
129         0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
130         0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
131         0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
132         0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
133         0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
134         0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
135         0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
136         0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
137         0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
138         0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
139         0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
140         0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
141         0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
142         0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
143         0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
144         0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
145         0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
146         0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
147         0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
148         0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
149         0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
150         0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
151         0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
# Remainder of the encryption code, emitted directly after the 2KB word
# table that starts at AES_Te:
#
#   * 256 single-byte entries.  They end up at AES_Te+2048, are read with
#     ldub in the last round and are touched by the "prefetch te4" loads
#     inside the main loop.
#
#   * _sparcv9_AES_encrypt - internal routine.  After its own "save" it
#     expects the block in $s0-$s3, the table address in $tbl and the key
#     schedule in $key; the result is left in $s0-$s3.  Because $rounds
#     aliases %i7, the return address is parked on the stack on entry and
#     reloaded before "ret".  The value read from [$key+240] is shifted
#     right by one to form the loop counter; every pass through
#     .Lenc_loop advances $key by 32 bytes.
#
#   * AES_encrypt - global entry point.  The first two arguments are the
#     input and output pointers, the third is handed to the internal
#     routine as $key.  If either pointer is not 4-byte aligned the block
#     is loaded and stored byte by byte (.Lunaligned_enc), otherwise word
#     by word.  The table address is computed relative to the "call .+8"
#     instruction, so no absolute address is needed.
152 $code.=<<___;
153         .byte   0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
154         .byte   0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
155         .byte   0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
156         .byte   0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
157         .byte   0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
158         .byte   0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
159         .byte   0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
160         .byte   0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
161         .byte   0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
162         .byte   0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
163         .byte   0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
164         .byte   0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
165         .byte   0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
166         .byte   0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
167         .byte   0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
168         .byte   0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
169         .byte   0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
170         .byte   0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
171         .byte   0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
172         .byte   0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
173         .byte   0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
174         .byte   0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
175         .byte   0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
176         .byte   0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
177         .byte   0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
178         .byte   0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
179         .byte   0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
180         .byte   0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
181         .byte   0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
182         .byte   0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
183         .byte   0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
184         .byte   0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
185 .type   AES_Te,#object
186 .size   AES_Te,(.-AES_Te)
187
188 .align  64
189 .skip   16
190 _sparcv9_AES_encrypt:
191         save    %sp,-$frame-$locals,%sp
192         stx     %i7,[%sp+$bias+$frame+0]        ! off-load return address
193         ld      [$key+240],$rounds
194         ld      [$key+0],$t0
195         ld      [$key+4],$t1                    !
196         ld      [$key+8],$t2
197         srl     $rounds,1,$rounds
198         xor     $t0,$s0,$s0
199         ld      [$key+12],$t3
200         srl     $s0,21,$acc0
201         xor     $t1,$s1,$s1
202         ld      [$key+16],$t0
203         srl     $s1,13,$acc1                    !
204         xor     $t2,$s2,$s2
205         ld      [$key+20],$t1
206         xor     $t3,$s3,$s3
207         ld      [$key+24],$t2
208         and     $acc0,2040,$acc0
209         ld      [$key+28],$t3
210         nop
211 .Lenc_loop:
212         srl     $s2,5,$acc2                     !
213         and     $acc1,2040,$acc1
214         ldx     [$tbl+$acc0],$acc0
215         sll     $s3,3,$acc3
216         and     $acc2,2040,$acc2
217         ldx     [$tbl+$acc1],$acc1
218         srl     $s1,21,$acc4
219         and     $acc3,2040,$acc3
220         ldx     [$tbl+$acc2],$acc2              !
221         srl     $s2,13,$acc5
222         and     $acc4,2040,$acc4
223         ldx     [$tbl+$acc3],$acc3
224         srl     $s3,5,$acc6
225         and     $acc5,2040,$acc5
226         ldx     [$tbl+$acc4],$acc4
227         fmovs   %f0,%f0
228         sll     $s0,3,$acc7                     !
229         and     $acc6,2040,$acc6
230         ldx     [$tbl+$acc5],$acc5
231         srl     $s2,21,$acc8
232         and     $acc7,2040,$acc7
233         ldx     [$tbl+$acc6],$acc6
234         srl     $s3,13,$acc9
235         and     $acc8,2040,$acc8
236         ldx     [$tbl+$acc7],$acc7              !
237         srl     $s0,5,$acc10
238         and     $acc9,2040,$acc9
239         ldx     [$tbl+$acc8],$acc8
240         sll     $s1,3,$acc11
241         and     $acc10,2040,$acc10
242         ldx     [$tbl+$acc9],$acc9
243         fmovs   %f0,%f0
244         srl     $s3,21,$acc12                   !
245         and     $acc11,2040,$acc11
246         ldx     [$tbl+$acc10],$acc10
247         srl     $s0,13,$acc13
248         and     $acc12,2040,$acc12
249         ldx     [$tbl+$acc11],$acc11
250         srl     $s1,5,$acc14
251         and     $acc13,2040,$acc13
252         ldx     [$tbl+$acc12],$acc12            !
253         sll     $s2,3,$acc15
254         and     $acc14,2040,$acc14
255         ldx     [$tbl+$acc13],$acc13
256         and     $acc15,2040,$acc15
257         add     $key,32,$key
258         ldx     [$tbl+$acc14],$acc14
259         fmovs   %f0,%f0
260         subcc   $rounds,1,$rounds               !
261         ldx     [$tbl+$acc15],$acc15
262         bz,a,pn %icc,.Lenc_last
263         add     $tbl,2048,$rounds
264
265                 srlx    $acc1,8,$acc1
266                 xor     $acc0,$t0,$t0
267         ld      [$key+0],$s0
268         fmovs   %f0,%f0
269                 srlx    $acc2,16,$acc2          !
270                 xor     $acc1,$t0,$t0
271         ld      [$key+4],$s1
272                 srlx    $acc3,24,$acc3
273                 xor     $acc2,$t0,$t0
274         ld      [$key+8],$s2
275                 srlx    $acc5,8,$acc5
276                 xor     $acc3,$t0,$t0
277         ld      [$key+12],$s3                   !
278                 srlx    $acc6,16,$acc6
279                 xor     $acc4,$t1,$t1
280         fmovs   %f0,%f0
281                 srlx    $acc7,24,$acc7
282                 xor     $acc5,$t1,$t1
283                 srlx    $acc9,8,$acc9
284                 xor     $acc6,$t1,$t1
285                 srlx    $acc10,16,$acc10        !
286                 xor     $acc7,$t1,$t1
287                 srlx    $acc11,24,$acc11
288                 xor     $acc8,$t2,$t2
289                 srlx    $acc13,8,$acc13
290                 xor     $acc9,$t2,$t2
291                 srlx    $acc14,16,$acc14
292                 xor     $acc10,$t2,$t2
293                 srlx    $acc15,24,$acc15        !
294                 xor     $acc11,$t2,$t2
295                 xor     $acc12,$acc14,$acc14
296                 xor     $acc13,$t3,$t3
297         srl     $t0,21,$acc0
298                 xor     $acc14,$t3,$t3
299         srl     $t1,13,$acc1
300                 xor     $acc15,$t3,$t3
301
302         and     $acc0,2040,$acc0                !
303         srl     $t2,5,$acc2
304         and     $acc1,2040,$acc1
305         ldx     [$tbl+$acc0],$acc0
306         sll     $t3,3,$acc3
307         and     $acc2,2040,$acc2
308         ldx     [$tbl+$acc1],$acc1
309         fmovs   %f0,%f0
310         srl     $t1,21,$acc4                    !
311         and     $acc3,2040,$acc3
312         ldx     [$tbl+$acc2],$acc2
313         srl     $t2,13,$acc5
314         and     $acc4,2040,$acc4
315         ldx     [$tbl+$acc3],$acc3
316         srl     $t3,5,$acc6
317         and     $acc5,2040,$acc5
318         ldx     [$tbl+$acc4],$acc4              !
319         sll     $t0,3,$acc7
320         and     $acc6,2040,$acc6
321         ldx     [$tbl+$acc5],$acc5
322         srl     $t2,21,$acc8
323         and     $acc7,2040,$acc7
324         ldx     [$tbl+$acc6],$acc6
325         fmovs   %f0,%f0
326         srl     $t3,13,$acc9                    !
327         and     $acc8,2040,$acc8
328         ldx     [$tbl+$acc7],$acc7
329         srl     $t0,5,$acc10
330         and     $acc9,2040,$acc9
331         ldx     [$tbl+$acc8],$acc8
332         sll     $t1,3,$acc11
333         and     $acc10,2040,$acc10
334         ldx     [$tbl+$acc9],$acc9              !
335         srl     $t3,21,$acc12
336         and     $acc11,2040,$acc11
337         ldx     [$tbl+$acc10],$acc10
338         srl     $t0,13,$acc13
339         and     $acc12,2040,$acc12
340         ldx     [$tbl+$acc11],$acc11
341         fmovs   %f0,%f0
342         srl     $t1,5,$acc14                    !
343         and     $acc13,2040,$acc13
344         ldx     [$tbl+$acc12],$acc12
345         sll     $t2,3,$acc15
346         and     $acc14,2040,$acc14
347         ldx     [$tbl+$acc13],$acc13
348                 srlx    $acc1,8,$acc1
349         and     $acc15,2040,$acc15
350         ldx     [$tbl+$acc14],$acc14            !
351
352                 srlx    $acc2,16,$acc2
353                 xor     $acc0,$s0,$s0
354         ldx     [$tbl+$acc15],$acc15
355                 srlx    $acc3,24,$acc3
356                 xor     $acc1,$s0,$s0
357         ld      [$key+16],$t0
358         fmovs   %f0,%f0
359                 srlx    $acc5,8,$acc5           !
360                 xor     $acc2,$s0,$s0
361         ld      [$key+20],$t1
362                 srlx    $acc6,16,$acc6
363                 xor     $acc3,$s0,$s0
364         ld      [$key+24],$t2
365                 srlx    $acc7,24,$acc7
366                 xor     $acc4,$s1,$s1
367         ld      [$key+28],$t3                   !
368                 srlx    $acc9,8,$acc9
369                 xor     $acc5,$s1,$s1
370         ldx     [$tbl+2048+0],%g0               ! prefetch te4
371                 srlx    $acc10,16,$acc10
372                 xor     $acc6,$s1,$s1
373         ldx     [$tbl+2048+32],%g0              ! prefetch te4
374                 srlx    $acc11,24,$acc11
375                 xor     $acc7,$s1,$s1
376         ldx     [$tbl+2048+64],%g0              ! prefetch te4
377                 srlx    $acc13,8,$acc13
378                 xor     $acc8,$s2,$s2
379         ldx     [$tbl+2048+96],%g0              ! prefetch te4
380                 srlx    $acc14,16,$acc14        !
381                 xor     $acc9,$s2,$s2
382         ldx     [$tbl+2048+128],%g0             ! prefetch te4
383                 srlx    $acc15,24,$acc15
384                 xor     $acc10,$s2,$s2
385         ldx     [$tbl+2048+160],%g0             ! prefetch te4
386         srl     $s0,21,$acc0
387                 xor     $acc11,$s2,$s2
388         ldx     [$tbl+2048+192],%g0             ! prefetch te4
389                 xor     $acc12,$acc14,$acc14
390                 xor     $acc13,$s3,$s3
391         ldx     [$tbl+2048+224],%g0             ! prefetch te4
392         srl     $s1,13,$acc1                    !
393                 xor     $acc14,$s3,$s3
394                 xor     $acc15,$s3,$s3
395         ba      .Lenc_loop
396         and     $acc0,2040,$acc0
397
398 .align  32
399 .Lenc_last:
400                 srlx    $acc1,8,$acc1           !
401                 xor     $acc0,$t0,$t0
402         ld      [$key+0],$s0
403                 srlx    $acc2,16,$acc2
404                 xor     $acc1,$t0,$t0
405         ld      [$key+4],$s1
406                 srlx    $acc3,24,$acc3
407                 xor     $acc2,$t0,$t0
408         ld      [$key+8],$s2                    !
409                 srlx    $acc5,8,$acc5
410                 xor     $acc3,$t0,$t0
411         ld      [$key+12],$s3
412                 srlx    $acc6,16,$acc6
413                 xor     $acc4,$t1,$t1
414                 srlx    $acc7,24,$acc7
415                 xor     $acc5,$t1,$t1
416                 srlx    $acc9,8,$acc9           !
417                 xor     $acc6,$t1,$t1
418                 srlx    $acc10,16,$acc10
419                 xor     $acc7,$t1,$t1
420                 srlx    $acc11,24,$acc11
421                 xor     $acc8,$t2,$t2
422                 srlx    $acc13,8,$acc13
423                 xor     $acc9,$t2,$t2
424                 srlx    $acc14,16,$acc14        !
425                 xor     $acc10,$t2,$t2
426                 srlx    $acc15,24,$acc15
427                 xor     $acc11,$t2,$t2
428                 xor     $acc12,$acc14,$acc14
429                 xor     $acc13,$t3,$t3
430         srl     $t0,24,$acc0
431                 xor     $acc14,$t3,$t3
432         srl     $t1,16,$acc1                    !
433                 xor     $acc15,$t3,$t3
434
435         srl     $t2,8,$acc2
436         and     $acc1,255,$acc1
437         ldub    [$rounds+$acc0],$acc0
438         srl     $t1,24,$acc4
439         and     $acc2,255,$acc2
440         ldub    [$rounds+$acc1],$acc1
441         srl     $t2,16,$acc5                    !
442         and     $t3,255,$acc3
443         ldub    [$rounds+$acc2],$acc2
444         ldub    [$rounds+$acc3],$acc3
445         srl     $t3,8,$acc6
446         and     $acc5,255,$acc5
447         ldub    [$rounds+$acc4],$acc4
448         fmovs   %f0,%f0
449         srl     $t2,24,$acc8                    !
450         and     $acc6,255,$acc6
451         ldub    [$rounds+$acc5],$acc5
452         srl     $t3,16,$acc9
453         and     $t0,255,$acc7
454         ldub    [$rounds+$acc6],$acc6
455         ldub    [$rounds+$acc7],$acc7
456         fmovs   %f0,%f0
457         srl     $t0,8,$acc10                    !
458         and     $acc9,255,$acc9
459         ldub    [$rounds+$acc8],$acc8
460         srl     $t3,24,$acc12
461         and     $acc10,255,$acc10
462         ldub    [$rounds+$acc9],$acc9
463         srl     $t0,16,$acc13
464         and     $t1,255,$acc11
465         ldub    [$rounds+$acc10],$acc10         !
466         srl     $t1,8,$acc14
467         and     $acc13,255,$acc13
468         ldub    [$rounds+$acc11],$acc11
469         ldub    [$rounds+$acc12],$acc12
470         and     $acc14,255,$acc14
471         ldub    [$rounds+$acc13],$acc13
472         and     $t2,255,$acc15
473         ldub    [$rounds+$acc14],$acc14         !
474
475                 sll     $acc0,24,$acc0
476                 xor     $acc3,$s0,$s0
477         ldub    [$rounds+$acc15],$acc15
478                 sll     $acc1,16,$acc1
479                 xor     $acc0,$s0,$s0
480         ldx     [%sp+$bias+$frame+0],%i7        ! restore return address
481         fmovs   %f0,%f0
482                 sll     $acc2,8,$acc2           !
483                 xor     $acc1,$s0,$s0
484                 sll     $acc4,24,$acc4
485                 xor     $acc2,$s0,$s0
486                 sll     $acc5,16,$acc5
487                 xor     $acc7,$s1,$s1
488                 sll     $acc6,8,$acc6
489                 xor     $acc4,$s1,$s1
490                 sll     $acc8,24,$acc8          !
491                 xor     $acc5,$s1,$s1
492                 sll     $acc9,16,$acc9
493                 xor     $acc11,$s2,$s2
494                 sll     $acc10,8,$acc10
495                 xor     $acc6,$s1,$s1
496                 sll     $acc12,24,$acc12
497                 xor     $acc8,$s2,$s2
498                 sll     $acc13,16,$acc13        !
499                 xor     $acc9,$s2,$s2
500                 sll     $acc14,8,$acc14
501                 xor     $acc10,$s2,$s2
502                 xor     $acc12,$acc14,$acc14
503                 xor     $acc13,$s3,$s3
504                 xor     $acc14,$s3,$s3
505                 xor     $acc15,$s3,$s3
506
507         ret
508         restore
509 .type   _sparcv9_AES_encrypt,#function
510 .size   _sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt)
511
512 .align  32
513 .globl  AES_encrypt
514 AES_encrypt:
515         or      %o0,%o1,%g1
516         andcc   %g1,3,%g0
517         bnz,pn  %xcc,.Lunaligned_enc
518         save    %sp,-$frame,%sp
519
520         ld      [%i0+0],%o0
521         ld      [%i0+4],%o1
522         ld      [%i0+8],%o2
523         ld      [%i0+12],%o3
524
525 1:      call    .+8
526         sub     %o7,1b-AES_Te,%o4
527         call    _sparcv9_AES_encrypt
528         mov     %i2,%o5
529
530         st      %o0,[%i1+0]
531         st      %o1,[%i1+4]
532         st      %o2,[%i1+8]
533         st      %o3,[%i1+12]
534
535         ret
536         restore
537
538 .align  32
539 .Lunaligned_enc:
540         ldub    [%i0+0],%l0
541         ldub    [%i0+1],%l1
542         ldub    [%i0+2],%l2
543
544         sll     %l0,24,%l0
545         ldub    [%i0+3],%l3
546         sll     %l1,16,%l1
547         ldub    [%i0+4],%l4
548         sll     %l2,8,%l2
549         or      %l1,%l0,%l0
550         ldub    [%i0+5],%l5
551         sll     %l4,24,%l4
552         or      %l3,%l2,%l2
553         ldub    [%i0+6],%l6
554         sll     %l5,16,%l5
555         or      %l0,%l2,%o0
556         ldub    [%i0+7],%l7
557
558         sll     %l6,8,%l6
559         or      %l5,%l4,%l4
560         ldub    [%i0+8],%l0
561         or      %l7,%l6,%l6
562         ldub    [%i0+9],%l1
563         or      %l4,%l6,%o1
564         ldub    [%i0+10],%l2
565
566         sll     %l0,24,%l0
567         ldub    [%i0+11],%l3
568         sll     %l1,16,%l1
569         ldub    [%i0+12],%l4
570         sll     %l2,8,%l2
571         or      %l1,%l0,%l0
572         ldub    [%i0+13],%l5
573         sll     %l4,24,%l4
574         or      %l3,%l2,%l2
575         ldub    [%i0+14],%l6
576         sll     %l5,16,%l5
577         or      %l0,%l2,%o2
578         ldub    [%i0+15],%l7
579
580         sll     %l6,8,%l6
581         or      %l5,%l4,%l4
582         or      %l7,%l6,%l6
583         or      %l4,%l6,%o3
584
585 1:      call    .+8
586         sub     %o7,1b-AES_Te,%o4
587         call    _sparcv9_AES_encrypt
588         mov     %i2,%o5
589
590         srl     %o0,24,%l0
591         srl     %o0,16,%l1
592         stb     %l0,[%i1+0]
593         srl     %o0,8,%l2
594         stb     %l1,[%i1+1]
595         stb     %l2,[%i1+2]
596         srl     %o1,24,%l4
597         stb     %o0,[%i1+3]
598
599         srl     %o1,16,%l5
600         stb     %l4,[%i1+4]
601         srl     %o1,8,%l6
602         stb     %l5,[%i1+5]
603         stb     %l6,[%i1+6]
604         srl     %o2,24,%l0
605         stb     %o1,[%i1+7]
606
607         srl     %o2,16,%l1
608         stb     %l0,[%i1+8]
609         srl     %o2,8,%l2
610         stb     %l1,[%i1+9]
611         stb     %l2,[%i1+10]
612         srl     %o3,24,%l4
613         stb     %o2,[%i1+11]
614
615         srl     %o3,16,%l5
616         stb     %l4,[%i1+12]
617         srl     %o3,8,%l6
618         stb     %l5,[%i1+13]
619         stb     %l6,[%i1+14]
620         stb     %o3,[%i1+15]
621
622         ret
623         restore
624 .type   AES_encrypt,#function
625 .size   AES_encrypt,(.-AES_encrypt)
626
627 ___
628
# Place the 256-byte aligned AES_Td label; the table contents are
# appended by the _data_word call that follows.
629 $code.=<<___;
630 .align  256
631 AES_Td:
632 ___
# Decryption lookup table: 256 32-bit words.  _data_word writes every
# word twice, so the table takes 8 bytes per entry (2KB in total) in the
# generated assembly.
633 &_data_word(
634         0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
635         0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
636         0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
637         0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
638         0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
639         0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
640         0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
641         0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
642         0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
643         0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
644         0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
645         0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
646         0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
647         0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
648         0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
649         0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
650         0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
651         0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
652         0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
653         0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
654         0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
655         0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
656         0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
657         0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
658         0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
659         0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
660         0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
661         0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
662         0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
663         0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
664         0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
665         0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
666         0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
667         0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
668         0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
669         0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
670         0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
671         0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
672         0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
673         0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
674         0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
675         0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
676         0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
677         0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
678         0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
679         0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
680         0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
681         0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
682         0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
683         0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
684         0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
685         0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
686         0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
687         0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
688         0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
689         0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
690         0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
691         0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
692         0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
693         0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
694         0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
695         0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
696         0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
697         0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
698 $code.=<<___;
699         .byte   0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
700         .byte   0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
701         .byte   0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
702         .byte   0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
703         .byte   0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
704         .byte   0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
705         .byte   0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
706         .byte   0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
707         .byte   0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
708         .byte   0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
709         .byte   0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
710         .byte   0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
711         .byte   0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
712         .byte   0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
713         .byte   0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
714         .byte   0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
715         .byte   0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
716         .byte   0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
717         .byte   0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
718         .byte   0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
719         .byte   0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
720         .byte   0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
721         .byte   0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
722         .byte   0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
723         .byte   0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
724         .byte   0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
725         .byte   0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
726         .byte   0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
727         .byte   0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
728         .byte   0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
729         .byte   0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
730         .byte   0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
731 .type   AES_Td,#object
732 .size   AES_Td,(.-AES_Td)
733
! _sparcv9_AES_decrypt -- internal block-decryption core, called by
! AES_decrypt in this file with the table address in %o4 and the key
! pointer in %o5 (the caller's view, before the save).
!
! Registers are named through Perl variables interpolated into this
! text; their assignments are made earlier in the file. Roles as used
! in this routine:
!   s0-s3      four 32-bit state words: input block on entry, result
!              on return
!   t0-t3      alternate set of state words, preloaded with round key
!   key        key schedule pointer; round keys are read as 32-bit
!              words and the word at offset 240 is the round count
!   tbl        lookup table base: 2048 bytes addressed in 8-byte
!              entries, with a 256-byte table at offset 2048
!   rounds     round count halved, used as loop counter; on the way
!              out it is reused as pointer to the 256-byte table
!   acc0-acc15 table indices, then the values looked up
! Every pass through .Ldec_loop performs two rounds: s -> t, then
! t -> s. The loop is left from the middle of a pass once the counter
! reaches zero, and .Ldec_last completes that round and runs the final
! one with byte-wide lookups.
! NOTE(review): the 16-byte skip after the 64-byte alignment and the
! lone nop before the loop look like padding that positions the loop
! body -- confirm before moving code around.
734 .align  64
735 .skip   16
736 _sparcv9_AES_decrypt:
! New register window; the frame is enlarged by the locals area, which
! receives %i7. NOTE(review): presumably %i7 doubles as one of the
! scratch registers in this routine -- assignments not visible here.
737         save    %sp,-$frame-$locals,%sp
738         stx     %i7,[%sp+$bias+$frame+0]        ! off-load return address
739         ld      [$key+240],$rounds
! Xor key words 0-3 into the state while preloading key words 4-7 into
! t0-t3 for the first round; the counter is halved because each loop
! pass covers two rounds.
740         ld      [$key+0],$t0
741         ld      [$key+4],$t1                    !
742         ld      [$key+8],$t2
743         ld      [$key+12],$t3
744         srl     $rounds,1,$rounds
745         xor     $t0,$s0,$s0
746         ld      [$key+16],$t0
747         xor     $t1,$s1,$s1
748         ld      [$key+20],$t1
749         srl     $s0,21,$acc0                    !
750         xor     $t2,$s2,$s2
751         ld      [$key+24],$t2
752         xor     $t3,$s3,$s3
753         and     $acc0,2040,$acc0
754         ld      [$key+28],$t3
755         srl     $s3,13,$acc1
756         nop
! First half of a pass: sixteen 64-bit table loads indexed by the bytes
! of s0-s3. An index is a state byte times 8: a right shift by 21, 13
! or 5, or a left shift by 3, followed by a mask of 2040, selects the
! byte at bits 31-24, 23-16, 15-8 or 7-0 respectively.
757 .Ldec_loop:
758         srl     $s2,5,$acc2                     !
759         and     $acc1,2040,$acc1
760         ldx     [$tbl+$acc0],$acc0
761         sll     $s1,3,$acc3
762         and     $acc2,2040,$acc2
763         ldx     [$tbl+$acc1],$acc1
764         srl     $s1,21,$acc4
765         and     $acc3,2040,$acc3
766         ldx     [$tbl+$acc2],$acc2              !
767         srl     $s0,13,$acc5
768         and     $acc4,2040,$acc4
769         ldx     [$tbl+$acc3],$acc3
770         srl     $s3,5,$acc6
771         and     $acc5,2040,$acc5
772         ldx     [$tbl+$acc4],$acc4
773         fmovs   %f0,%f0
774         sll     $s2,3,$acc7                     !
775         and     $acc6,2040,$acc6
776         ldx     [$tbl+$acc5],$acc5
777         srl     $s2,21,$acc8
778         and     $acc7,2040,$acc7
779         ldx     [$tbl+$acc6],$acc6
780         srl     $s1,13,$acc9
781         and     $acc8,2040,$acc8
782         ldx     [$tbl+$acc7],$acc7              !
783         srl     $s0,5,$acc10
784         and     $acc9,2040,$acc9
785         ldx     [$tbl+$acc8],$acc8
786         sll     $s3,3,$acc11
787         and     $acc10,2040,$acc10
788         ldx     [$tbl+$acc9],$acc9
789         fmovs   %f0,%f0
790         srl     $s3,21,$acc12                   !
791         and     $acc11,2040,$acc11
792         ldx     [$tbl+$acc10],$acc10
793         srl     $s2,13,$acc13
794         and     $acc12,2040,$acc12
795         ldx     [$tbl+$acc11],$acc11
796         srl     $s1,5,$acc14
797         and     $acc13,2040,$acc13
798         ldx     [$tbl+$acc12],$acc12            !
799         sll     $s0,3,$acc15
800         and     $acc14,2040,$acc14
801         ldx     [$tbl+$acc13],$acc13
802         and     $acc15,2040,$acc15
! Step the key pointer past the two round keys consumed by this pass.
803         add     $key,32,$key
804         ldx     [$tbl+$acc14],$acc14
805         fmovs   %f0,%f0
! Count the pass; at zero leave for the final round. The delay slot is
! annulled unless the branch is taken, so the counter register is only
! overwritten with the byte-table address (tbl+2048) on the way out.
806         subcc   $rounds,1,$rounds               !
807         ldx     [$tbl+$acc15],$acc15
808         bz,a,pn %icc,.Ldec_last
809         add     $tbl,2048,$rounds
810
! Combine the lookups into t0-t3, which already hold the round key:
! within each group of four, lookups 1, 2 and 3 are shifted right by
! 8, 16 and 24 bits before the xor. Meanwhile the next round key is
! loaded into s0-s3.
! NOTE(review): presumably each 8-byte table entry stores its 32-bit
! word twice, so that these shifts produce byte rotations of it --
! the table body is not visible in this part of the file.
811                 srlx    $acc1,8,$acc1
812                 xor     $acc0,$t0,$t0
813         ld      [$key+0],$s0
814         fmovs   %f0,%f0
815                 srlx    $acc2,16,$acc2          !
816                 xor     $acc1,$t0,$t0
817         ld      [$key+4],$s1
818                 srlx    $acc3,24,$acc3
819                 xor     $acc2,$t0,$t0
820         ld      [$key+8],$s2
821                 srlx    $acc5,8,$acc5
822                 xor     $acc3,$t0,$t0
823         ld      [$key+12],$s3                   !
824                 srlx    $acc6,16,$acc6
825                 xor     $acc4,$t1,$t1
826         fmovs   %f0,%f0
827                 srlx    $acc7,24,$acc7
828                 xor     $acc5,$t1,$t1
829                 srlx    $acc9,8,$acc9
830                 xor     $acc6,$t1,$t1
831                 srlx    $acc10,16,$acc10        !
832                 xor     $acc7,$t1,$t1
833                 srlx    $acc11,24,$acc11
834                 xor     $acc8,$t2,$t2
835                 srlx    $acc13,8,$acc13
836                 xor     $acc9,$t2,$t2
837                 srlx    $acc14,16,$acc14
838                 xor     $acc10,$t2,$t2
839                 srlx    $acc15,24,$acc15        !
840                 xor     $acc11,$t2,$t2
841                 xor     $acc12,$acc14,$acc14
842                 xor     $acc13,$t3,$t3
843         srl     $t0,21,$acc0
844                 xor     $acc14,$t3,$t3
845                 xor     $acc15,$t3,$t3
846         srl     $t3,13,$acc1
847
! Second half of the pass: the same sixteen lookups, now indexed by the
! bytes of t0-t3.
848         and     $acc0,2040,$acc0                !
849         srl     $t2,5,$acc2
850         and     $acc1,2040,$acc1
851         ldx     [$tbl+$acc0],$acc0
852         sll     $t1,3,$acc3
853         and     $acc2,2040,$acc2
854         ldx     [$tbl+$acc1],$acc1
855         fmovs   %f0,%f0
856         srl     $t1,21,$acc4                    !
857         and     $acc3,2040,$acc3
858         ldx     [$tbl+$acc2],$acc2
859         srl     $t0,13,$acc5
860         and     $acc4,2040,$acc4
861         ldx     [$tbl+$acc3],$acc3
862         srl     $t3,5,$acc6
863         and     $acc5,2040,$acc5
864         ldx     [$tbl+$acc4],$acc4              !
865         sll     $t2,3,$acc7
866         and     $acc6,2040,$acc6
867         ldx     [$tbl+$acc5],$acc5
868         srl     $t2,21,$acc8
869         and     $acc7,2040,$acc7
870         ldx     [$tbl+$acc6],$acc6
871         fmovs   %f0,%f0
872         srl     $t1,13,$acc9                    !
873         and     $acc8,2040,$acc8
874         ldx     [$tbl+$acc7],$acc7
875         srl     $t0,5,$acc10
876         and     $acc9,2040,$acc9
877         ldx     [$tbl+$acc8],$acc8
878         sll     $t3,3,$acc11
879         and     $acc10,2040,$acc10
880         ldx     [$tbl+$acc9],$acc9              !
881         srl     $t3,21,$acc12
882         and     $acc11,2040,$acc11
883         ldx     [$tbl+$acc10],$acc10
884         srl     $t2,13,$acc13
885         and     $acc12,2040,$acc12
886         ldx     [$tbl+$acc11],$acc11
887         fmovs   %f0,%f0
888         srl     $t1,5,$acc14                    !
889         and     $acc13,2040,$acc13
890         ldx     [$tbl+$acc12],$acc12
891         sll     $t0,3,$acc15
892         and     $acc14,2040,$acc14
893         ldx     [$tbl+$acc13],$acc13
894                 srlx    $acc1,8,$acc1
895         and     $acc15,2040,$acc15
896         ldx     [$tbl+$acc14],$acc14            !
897
! Combine into s0-s3 (holding the round key loaded above) and preload
! t0-t3 with the key for the next pass. The eight loads into %g0
! discard their data; they touch the 256-byte table at 32-byte steps.
898                 srlx    $acc2,16,$acc2
899                 xor     $acc0,$s0,$s0
900         ldx     [$tbl+$acc15],$acc15
901                 srlx    $acc3,24,$acc3
902                 xor     $acc1,$s0,$s0
903         ld      [$key+16],$t0
904         fmovs   %f0,%f0
905                 srlx    $acc5,8,$acc5           !
906                 xor     $acc2,$s0,$s0
907         ld      [$key+20],$t1
908                 srlx    $acc6,16,$acc6
909                 xor     $acc3,$s0,$s0
910         ld      [$key+24],$t2
911                 srlx    $acc7,24,$acc7
912                 xor     $acc4,$s1,$s1
913         ld      [$key+28],$t3                   !
914                 srlx    $acc9,8,$acc9
915                 xor     $acc5,$s1,$s1
916         ldx     [$tbl+2048+0],%g0               ! prefetch td4
917                 srlx    $acc10,16,$acc10
918                 xor     $acc6,$s1,$s1
919         ldx     [$tbl+2048+32],%g0              ! prefetch td4
920                 srlx    $acc11,24,$acc11
921                 xor     $acc7,$s1,$s1
922         ldx     [$tbl+2048+64],%g0              ! prefetch td4
923                 srlx    $acc13,8,$acc13
924                 xor     $acc8,$s2,$s2
925         ldx     [$tbl+2048+96],%g0              ! prefetch td4
926                 srlx    $acc14,16,$acc14        !
927                 xor     $acc9,$s2,$s2
928         ldx     [$tbl+2048+128],%g0             ! prefetch td4
929                 srlx    $acc15,24,$acc15
930                 xor     $acc10,$s2,$s2
931         ldx     [$tbl+2048+160],%g0             ! prefetch td4
932         srl     $s0,21,$acc0
933                 xor     $acc11,$s2,$s2
934         ldx     [$tbl+2048+192],%g0             ! prefetch td4
935                 xor     $acc12,$acc14,$acc14
936                 xor     $acc13,$s3,$s3
937         ldx     [$tbl+2048+224],%g0             ! prefetch td4
938         and     $acc0,2040,$acc0                !
939                 xor     $acc14,$s3,$s3
940                 xor     $acc15,$s3,$s3
! Next pass; the delay slot starts the second index computation, as the
! code ahead of .Ldec_loop does for the first pass.
941         ba      .Ldec_loop
942         srl     $s3,13,$acc1
943
944 .align  32
! Loop exit: finish the round whose lookups were issued in the first
! half (same combine as in the loop, result in t0-t3) and load the last
! round key into s0-s3.
945 .Ldec_last:
946                 srlx    $acc1,8,$acc1           !
947                 xor     $acc0,$t0,$t0
948         ld      [$key+0],$s0
949                 srlx    $acc2,16,$acc2
950                 xor     $acc1,$t0,$t0
951         ld      [$key+4],$s1
952                 srlx    $acc3,24,$acc3
953                 xor     $acc2,$t0,$t0
954         ld      [$key+8],$s2                    !
955                 srlx    $acc5,8,$acc5
956                 xor     $acc3,$t0,$t0
957         ld      [$key+12],$s3
958                 srlx    $acc6,16,$acc6
959                 xor     $acc4,$t1,$t1
960                 srlx    $acc7,24,$acc7
961                 xor     $acc5,$t1,$t1
962                 srlx    $acc9,8,$acc9           !
963                 xor     $acc6,$t1,$t1
964                 srlx    $acc10,16,$acc10
965                 xor     $acc7,$t1,$t1
966                 srlx    $acc11,24,$acc11
967                 xor     $acc8,$t2,$t2
968                 srlx    $acc13,8,$acc13
969                 xor     $acc9,$t2,$t2
970                 srlx    $acc14,16,$acc14        !
971                 xor     $acc10,$t2,$t2
972                 srlx    $acc15,24,$acc15
973                 xor     $acc11,$t2,$t2
974                 xor     $acc12,$acc14,$acc14
975                 xor     $acc13,$t3,$t3
976         srl     $t0,24,$acc0
977                 xor     $acc14,$t3,$t3
978                 xor     $acc15,$t3,$t3          !
979         srl     $t3,16,$acc1
980
! Final round: sixteen single-byte lookups in the table the counter
! register now points to (tbl+2048). Indices are plain bytes of t0-t3:
! shifts by 24, 16 and 8 with a mask of 255, or the low byte directly.
981         srl     $t2,8,$acc2
982         and     $acc1,255,$acc1
983         ldub    [$rounds+$acc0],$acc0
984         srl     $t1,24,$acc4
985         and     $acc2,255,$acc2
986         ldub    [$rounds+$acc1],$acc1
987         srl     $t0,16,$acc5                    !
988         and     $t1,255,$acc3
989         ldub    [$rounds+$acc2],$acc2
990         ldub    [$rounds+$acc3],$acc3
991         srl     $t3,8,$acc6
992         and     $acc5,255,$acc5
993         ldub    [$rounds+$acc4],$acc4
994         fmovs   %f0,%f0
995         srl     $t2,24,$acc8                    !
996         and     $acc6,255,$acc6
997         ldub    [$rounds+$acc5],$acc5
998         srl     $t1,16,$acc9
999         and     $t2,255,$acc7
1000         ldub    [$rounds+$acc6],$acc6
1001         ldub    [$rounds+$acc7],$acc7
1002         fmovs   %f0,%f0
1003         srl     $t0,8,$acc10                    !
1004         and     $acc9,255,$acc9
1005         ldub    [$rounds+$acc8],$acc8
1006         srl     $t3,24,$acc12
1007         and     $acc10,255,$acc10
1008         ldub    [$rounds+$acc9],$acc9
1009         srl     $t2,16,$acc13
1010         and     $t3,255,$acc11
1011         ldub    [$rounds+$acc10],$acc10         !
1012         srl     $t1,8,$acc14
1013         and     $acc13,255,$acc13
1014         ldub    [$rounds+$acc11],$acc11
1015         ldub    [$rounds+$acc12],$acc12
1016         and     $acc14,255,$acc14
1017         ldub    [$rounds+$acc13],$acc13
1018         and     $t0,255,$acc15
1019         ldub    [$rounds+$acc14],$acc14         !
1020
! Reassemble the output words: the four bytes of each word are moved to
! bit positions 24, 16, 8 and 0 and xored into s0-s3, which hold the
! last round key.
1021                 sll     $acc0,24,$acc0
1022                 xor     $acc3,$s0,$s0
1023         ldub    [$rounds+$acc15],$acc15
1024                 sll     $acc1,16,$acc1
1025                 xor     $acc0,$s0,$s0
1026         ldx     [%sp+$bias+$frame+0],%i7        ! restore return address
1027         fmovs   %f0,%f0
1028                 sll     $acc2,8,$acc2           !
1029                 xor     $acc1,$s0,$s0
1030                 sll     $acc4,24,$acc4
1031                 xor     $acc2,$s0,$s0
1032                 sll     $acc5,16,$acc5
1033                 xor     $acc7,$s1,$s1
1034                 sll     $acc6,8,$acc6
1035                 xor     $acc4,$s1,$s1
1036                 sll     $acc8,24,$acc8          !
1037                 xor     $acc5,$s1,$s1
1038                 sll     $acc9,16,$acc9
1039                 xor     $acc11,$s2,$s2
1040                 sll     $acc10,8,$acc10
1041                 xor     $acc6,$s1,$s1
1042                 sll     $acc12,24,$acc12
1043                 xor     $acc8,$s2,$s2
1044                 sll     $acc13,16,$acc13        !
1045                 xor     $acc9,$s2,$s2
1046                 sll     $acc14,8,$acc14
1047                 xor     $acc10,$s2,$s2
1048                 xor     $acc12,$acc14,$acc14
1049                 xor     $acc13,$s3,$s3
1050                 xor     $acc14,$s3,$s3
1051                 xor     $acc15,$s3,$s3
1052
! Return with the register window restored in the delay slot.
1053         ret
1054         restore
1055 .type   _sparcv9_AES_decrypt,#function
1056 .size   _sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt)
1057
! AES_decrypt -- public entry point; decrypts one 16-byte block.
! Arguments as used below (incoming %o registers, seen as %i registers
! after the save):
!   %o0  pointer to the 16 input bytes
!   %o1  pointer to the 16 output bytes
!   %o2  pointer handed on unchanged to _sparcv9_AES_decrypt in %o5
!        (presumably the key schedule -- the core's register names are
!        assigned outside this part of the file)
! The block travels to and from the core routine as four 32-bit words
! in %o0-%o3. When both data pointers are 4-byte aligned the words are
! moved with word loads and stores; otherwise the code at
! .Lunaligned_dec moves them one byte at a time.
1058 .align  32
1059 .globl  AES_decrypt
1060 AES_decrypt:
! Test the low two bits of both pointers at once. The save sits in the
! branch delay slot and is not annulled, so it runs on either path.
1061         or      %o0,%o1,%g1
1062         andcc   %g1,3,%g0
1063         bnz,pn  %xcc,.Lunaligned_dec
1064         save    %sp,-$frame,%sp
1065
1066         ld      [%i0+0],%o0
1067         ld      [%i0+4],%o1
1068         ld      [%i0+8],%o2
1069         ld      [%i0+12],%o3
1070
! Position-independent address of AES_Td: the call leaves its own
! address (label 1) in %o7, and the delay slot subtracts the distance
! from AES_Td to that label, giving the table address in %o4.
1071 1:      call    .+8
1072         sub     %o7,1b-AES_Td,%o4
! Run the core; its delay slot passes the third argument in %o5.
1073         call    _sparcv9_AES_decrypt
1074         mov     %i2,%o5
1075
1076         st      %o0,[%i1+0]
1077         st      %o1,[%i1+4]
1078         st      %o2,[%i1+8]
1079         st      %o3,[%i1+12]
1080
1081         ret
1082         restore
1083
1084 .align  32
! Unaligned path: build each 32-bit word from four byte loads, first
! byte in the most significant position, interleaving the loads for the
! next word with the shifts and ors of the current one.
1085 .Lunaligned_dec:
1086         ldub    [%i0+0],%l0
1087         ldub    [%i0+1],%l1
1088         ldub    [%i0+2],%l2
1089
1090         sll     %l0,24,%l0
1091         ldub    [%i0+3],%l3
1092         sll     %l1,16,%l1
1093         ldub    [%i0+4],%l4
1094         sll     %l2,8,%l2
1095         or      %l1,%l0,%l0
1096         ldub    [%i0+5],%l5
1097         sll     %l4,24,%l4
1098         or      %l3,%l2,%l2
1099         ldub    [%i0+6],%l6
1100         sll     %l5,16,%l5
1101         or      %l0,%l2,%o0
1102         ldub    [%i0+7],%l7
1103
1104         sll     %l6,8,%l6
1105         or      %l5,%l4,%l4
1106         ldub    [%i0+8],%l0
1107         or      %l7,%l6,%l6
1108         ldub    [%i0+9],%l1
1109         or      %l4,%l6,%o1
1110         ldub    [%i0+10],%l2
1111
1112         sll     %l0,24,%l0
1113         ldub    [%i0+11],%l3
1114         sll     %l1,16,%l1
1115         ldub    [%i0+12],%l4
1116         sll     %l2,8,%l2
1117         or      %l1,%l0,%l0
1118         ldub    [%i0+13],%l5
1119         sll     %l4,24,%l4
1120         or      %l3,%l2,%l2
1121         ldub    [%i0+14],%l6
1122         sll     %l5,16,%l5
1123         or      %l0,%l2,%o2
1124         ldub    [%i0+15],%l7
1125
1126         sll     %l6,8,%l6
1127         or      %l5,%l4,%l4
1128         or      %l7,%l6,%l6
1129         or      %l4,%l6,%o3
1130
! Same table-address computation and core call as on the aligned path.
1131 1:      call    .+8
1132         sub     %o7,1b-AES_Td,%o4
1133         call    _sparcv9_AES_decrypt
1134         mov     %i2,%o5
1135
! Write the result back byte by byte, most significant byte of each
! word first; stb stores only the low 8 bits of its source register.
1136         srl     %o0,24,%l0
1137         srl     %o0,16,%l1
1138         stb     %l0,[%i1+0]
1139         srl     %o0,8,%l2
1140         stb     %l1,[%i1+1]
1141         stb     %l2,[%i1+2]
1142         srl     %o1,24,%l4
1143         stb     %o0,[%i1+3]
1144
1145         srl     %o1,16,%l5
1146         stb     %l4,[%i1+4]
1147         srl     %o1,8,%l6
1148         stb     %l5,[%i1+5]
1149         stb     %l6,[%i1+6]
1150         srl     %o2,24,%l0
1151         stb     %o1,[%i1+7]
1152
1153         srl     %o2,16,%l1
1154         stb     %l0,[%i1+8]
1155         srl     %o2,8,%l2
1156         stb     %l1,[%i1+9]
1157         stb     %l2,[%i1+10]
1158         srl     %o3,24,%l4
1159         stb     %o2,[%i1+11]
1160
1161         srl     %o3,16,%l5
1162         stb     %l4,[%i1+12]
1163         srl     %o3,8,%l6
1164         stb     %l5,[%i1+13]
1165         stb     %l6,[%i1+14]
1166         stb     %o3,[%i1+15]
1167
1168         ret
1169         restore
1170 .type   AES_decrypt,#function
1171 .size   AES_decrypt,(.-AES_decrypt)
1172 ___
1173
1174 # The fmovs instructions, which stand in for FP nops, were originally
1175 # added to meet specific instruction alignment requirements and so
1176 # maximize ILP. As UltraSPARC T1, a.k.a. Niagara, has a shared FPU, FP
1177 # nops can have an undesired effect there, so just omit them and
1178 # sacrifice a fraction of a percent in performance...
#
# Each match runs from "fmovs" to the end of its line (/m lets $ match at
# every line end, /g repeats over the whole text), so the instruction is
# deleted and the rest of its line is left in place. The replacement is a
# plain empty string; no /e evaluation is needed for that.
1179 $code =~ s/fmovs.*$//gm;
1180
# Emit the generated assembly on standard output.
1181 print $code;
# Buffered write errors only surface at close; fail loudly rather than
# leave a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";