c04125e28b657ce2af33d3a5d17d49a8c91c4c02
[openssl.git] / crypto / aes / asm / aes-sparcv9.pl
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. Rights for redistribution and usage in source and binary
6 # forms are granted according to the OpenSSL license.
7 # ====================================================================
8 #
9 # Version 1.0
10 #
11 # The major reason for undertaking this effort was to mitigate the hazard
12 # of cache-timing attacks. This is [currently and initially!] addressed in
13 # two ways. 1. S-boxes are compressed from 5KB to 2KB+256B size each.
14 # 2. References to them are scheduled for L2 cache latency, meaning
15 # that the tables don't have to reside in L1 cache. Once again, this
16 # is an initial draft and one should expect more countermeasures to
17 # be implemented...
18 #
19 # Even though performance was not the primary goal [on the contrary,
20 # extra shifts "induced" by compressed S-box and longer loop epilogue
21 # "induced" by scheduling for L2 have a negative effect on performance],
22 # the code turned out to run in ~23 cycles per processed byte en-/
23 # decrypted with a 128-bit key. This is a pretty good result for code
24 # with the mentioned qualities on an UltraSPARC core. Compared to Sun C
25 # generated code my encrypt procedure runs just a few percent faster,
26 # while the decrypt one is a whole 50% faster [yes, Sun C failed to
27 # generate an optimal decrypt procedure]. Compared to GNU C generated
28 # code both procedures are more than 60% faster:-)
29
# Target selection: build for the 32-bit ABI unless some command-line
# argument contains -m64 or -xarch=v9.
30 $bits=32;
31 for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
# $bias is added to %sp whenever the generated code addresses its stack
# frame, and $frame is the frame size handed to `save`; both depend on the
# ABI selected above.
32 if ($bits==64)  { $bias=2047; $frame=192; }
33 else            { $bias=0;    $frame=112; }
# Extra stack bytes the core routines reserve on top of $frame; the generated
# code parks the return address in the first 8 of them.
34 $locals=16;
35
# Register allocation for the generated SPARC code. These names are
# interpolated into the assembly heredocs further down.
#
# $acc0-$acc15 receive the sixteen table look-ups of one round, four per
# output word. The first register of each group ($acc0, $acc4, $acc8,
# $acc12) is a local register and is the only one the round code never
# shifts with srlx; the other twelve live in %o/%g registers.
# NOTE(review): presumably this split exists because only the low halves of
# %l registers can be relied upon in 32-bit builds -- confirm.
36 $acc0="%l0";
37 $acc1="%o0";
38 $acc2="%o1";
39 $acc3="%o2";
40
41 $acc4="%l1";
42 $acc5="%o3";
43 $acc6="%o4";
44 $acc7="%o5";
45
46 $acc8="%l2";
47 $acc9="%o7";
48 $acc10="%g1";
49 $acc11="%g2";
50
51 $acc12="%l3";
52 $acc13="%g3";
53 $acc14="%g4";
54 $acc15="%g5";
55
# $t0-$t3 and $s0-$s3 swap roles between the two halves of the loop body:
# one set is indexed into the table while the other, preloaded with round-key
# words, accumulates the xor of the look-up results.
56 $t0="%l4";
57 $t1="%l5";
58 $t2="%l6";
59 $t3="%l7";
60
# $s0-$s3 also carry the input block on entry and the result on exit.
61 $s0="%i0";
62 $s1="%i1";
63 $s2="%i2";
64 $s3="%i3";
# $tbl is the base address of the lookup table, $key points into the key
# schedule and is advanced by 32 bytes per loop iteration.
65 $tbl="%i4";
66 $key="%i5";
# $rounds counts loop iterations and is later reused as a pointer to the
# byte table at $tbl+2048.
67 $rounds="%i7";  # aliases with return address, which is off-loaded to stack
68
# Append each 32-bit value of the argument list to the global $code as one
# assembler line that carries the word twice:
#     "\t.long\t0xXXXXXXXX,0xXXXXXXXX\n"
# Doubling every entry lets the generated code fetch it with a single 64-bit
# load and obtain byte rotations of the word with a plain right shift.
#
# Arguments: list of integers; processing stops at the first undefined
#            element, exactly as before.
# Returns:   nothing useful; the only effect is the text appended to $code.
#
# The former empty prototype "()" declared a sub that takes no arguments and
# was only tolerated because callers use the prototype-bypassing
# &_data_word(...) form. It is dropped so that plain _data_word(...) calls
# work as well; existing &-style calls are unaffected.
sub _data_word
{   our $code;          # output buffer shared with the rest of the script
    while (defined(my $word = shift)) {
        $code .= sprintf("\t.long\t0x%08x,0x%08x\n", $word, $word);
    }
    return;
}
73
# Assembler preamble. In 64-bit builds %g2 and %g3 (used as $acc11 and
# $acc13) are declared as scratch registers.
74 $code.=<<___ if ($bits==64);
75 .register       %g2,#scratch
76 .register       %g3,#scratch
77 ___
# Switch to the text section and open the 64-byte-aligned AES_Te table; its
# word entries are emitted by the _data_word call that follows.
78 $code.=<<___;
79 .section        ".text",#alloc,#execinstr
80
81 .align  64
82 AES_Te:
83 ___
# Encryption lookup table: 256 words, each written twice by _data_word
# (8 bytes per entry, 2KB in total), so that a 64-bit load at index*8 returns
# the same word in both halves.
84 &_data_word(
85         0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
86         0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
87         0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
88         0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
89         0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
90         0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
91         0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
92         0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
93         0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
94         0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
95         0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
96         0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
97         0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
98         0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
99         0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
100         0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
101         0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
102         0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
103         0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
104         0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
105         0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
106         0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
107         0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
108         0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
109         0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
110         0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
111         0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
112         0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
113         0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
114         0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
115         0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
116         0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
117         0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
118         0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
119         0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
120         0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
121         0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
122         0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
123         0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
124         0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
125         0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
126         0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
127         0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
128         0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
129         0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
130         0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
131         0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
132         0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
133         0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
134         0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
135         0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
136         0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
137         0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
138         0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
139         0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
140         0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
141         0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
142         0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
143         0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
144         0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
145         0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
146         0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
147         0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
148         0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
# Remainder of AES_Te plus the encryption code.
#
# * The 256 .byte values follow the 2KB word table emitted above, so they
#   start at AES_Te+2048; the last round indexes them with ldub.
# * _sparcv9_AES_encrypt is the core routine. After its `save` it takes the
#   state in $s0-$s3, the table base in $tbl and the key schedule in $key.
#   It loads the round count from [$key+240], halves it, and runs
#   .Lenc_loop, which performs two rounds per iteration ($key advances by
#   32 bytes each time). Every look-up masks a shifted state word with 2040
#   (0xff<<3) and does a 64-bit ldx; because each table entry holds the same
#   word twice, the following srlx by 8/16/24 leaves a byte-rotated copy of
#   the word in the low 32 bits. $rounds shares %i7 with the return
#   address, which is therefore stored at [%sp+$bias+$frame] on entry and
#   reloaded before `ret`; when the counter reaches zero the annulled delay
#   slot repoints $rounds at $tbl+2048 for the byte look-ups of .Lenc_last.
#   The result is left in $s0-$s3, which the caller sees as %o0-%o3.
# * AES_encrypt is the global entry point; it reads 16 bytes from its first
#   argument, writes 16 bytes to its second and hands its third to the core
#   routine as the key pointer (via %o5). Word loads/stores are used when
#   (arg1|arg2)&3 is zero, the byte-wise .Lunaligned_enc path otherwise.
#   The address of AES_Te is computed in the delay slot of the call as %o7
#   minus the assembly-time distance 1b-AES_Te and passed in %o4.
# NOTE(review): the lone `!` markers and the `fmovs %f0,%f0` fillers look
# like instruction-scheduling aids -- confirm before touching them.
149 $code.=<<___;
150         .byte   0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
151         .byte   0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
152         .byte   0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
153         .byte   0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
154         .byte   0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
155         .byte   0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
156         .byte   0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
157         .byte   0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
158         .byte   0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
159         .byte   0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
160         .byte   0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
161         .byte   0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
162         .byte   0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
163         .byte   0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
164         .byte   0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
165         .byte   0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
166         .byte   0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
167         .byte   0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
168         .byte   0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
169         .byte   0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
170         .byte   0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
171         .byte   0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
172         .byte   0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
173         .byte   0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
174         .byte   0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
175         .byte   0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
176         .byte   0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
177         .byte   0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
178         .byte   0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
179         .byte   0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
180         .byte   0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
181         .byte   0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
182 .type   AES_Te,#object
183 .size   AES_Te,(.-AES_Te)
184
185 .align  64
186 .skip   16
187 _sparcv9_AES_encrypt:
188         save    %sp,-$frame-$locals,%sp
189         stx     %i7,[%sp+$bias+$frame+0]        ! off-load return address
190         ld      [$key+240],$rounds
191         ld      [$key+0],$t0
192         ld      [$key+4],$t1                    !
193         ld      [$key+8],$t2
194         srl     $rounds,1,$rounds
195         xor     $t0,$s0,$s0
196         ld      [$key+12],$t3
197         srl     $s0,21,$acc0
198         xor     $t1,$s1,$s1
199         ld      [$key+16],$t0
200         srl     $s1,13,$acc1                    !
201         xor     $t2,$s2,$s2
202         ld      [$key+20],$t1
203         xor     $t3,$s3,$s3
204         ld      [$key+24],$t2
205         and     $acc0,2040,$acc0
206         ld      [$key+28],$t3
207         nop
208 .Lenc_loop:
209         srl     $s2,5,$acc2                     !
210         and     $acc1,2040,$acc1
211         ldx     [$tbl+$acc0],$acc0
212         sll     $s3,3,$acc3
213         and     $acc2,2040,$acc2
214         ldx     [$tbl+$acc1],$acc1
215         srl     $s1,21,$acc4
216         and     $acc3,2040,$acc3
217         ldx     [$tbl+$acc2],$acc2              !
218         srl     $s2,13,$acc5
219         and     $acc4,2040,$acc4
220         ldx     [$tbl+$acc3],$acc3
221         srl     $s3,5,$acc6
222         and     $acc5,2040,$acc5
223         ldx     [$tbl+$acc4],$acc4
224         fmovs   %f0,%f0
225         sll     $s0,3,$acc7                     !
226         and     $acc6,2040,$acc6
227         ldx     [$tbl+$acc5],$acc5
228         srl     $s2,21,$acc8
229         and     $acc7,2040,$acc7
230         ldx     [$tbl+$acc6],$acc6
231         srl     $s3,13,$acc9
232         and     $acc8,2040,$acc8
233         ldx     [$tbl+$acc7],$acc7              !
234         srl     $s0,5,$acc10
235         and     $acc9,2040,$acc9
236         ldx     [$tbl+$acc8],$acc8
237         sll     $s1,3,$acc11
238         and     $acc10,2040,$acc10
239         ldx     [$tbl+$acc9],$acc9
240         fmovs   %f0,%f0
241         srl     $s3,21,$acc12                   !
242         and     $acc11,2040,$acc11
243         ldx     [$tbl+$acc10],$acc10
244         srl     $s0,13,$acc13
245         and     $acc12,2040,$acc12
246         ldx     [$tbl+$acc11],$acc11
247         srl     $s1,5,$acc14
248         and     $acc13,2040,$acc13
249         ldx     [$tbl+$acc12],$acc12            !
250         sll     $s2,3,$acc15
251         and     $acc14,2040,$acc14
252         ldx     [$tbl+$acc13],$acc13
253         and     $acc15,2040,$acc15
254         add     $key,32,$key
255         ldx     [$tbl+$acc14],$acc14
256         fmovs   %f0,%f0
257         subcc   $rounds,1,$rounds               !
258         ldx     [$tbl+$acc15],$acc15
259         bz,a,pn %icc,.Lenc_last
260         add     $tbl,2048,$rounds
261
262                 srlx    $acc1,8,$acc1
263                 xor     $acc0,$t0,$t0
264         ld      [$key+0],$s0
265         fmovs   %f0,%f0
266                 srlx    $acc2,16,$acc2          !
267                 xor     $acc1,$t0,$t0
268         ld      [$key+4],$s1
269                 srlx    $acc3,24,$acc3
270                 xor     $acc2,$t0,$t0
271         ld      [$key+8],$s2
272                 srlx    $acc5,8,$acc5
273                 xor     $acc3,$t0,$t0
274         ld      [$key+12],$s3                   !
275                 srlx    $acc6,16,$acc6
276                 xor     $acc4,$t1,$t1
277         fmovs   %f0,%f0
278                 srlx    $acc7,24,$acc7
279                 xor     $acc5,$t1,$t1
280                 srlx    $acc9,8,$acc9
281                 xor     $acc6,$t1,$t1
282                 srlx    $acc10,16,$acc10        !
283                 xor     $acc7,$t1,$t1
284                 srlx    $acc11,24,$acc11
285                 xor     $acc8,$t2,$t2
286                 srlx    $acc13,8,$acc13
287                 xor     $acc9,$t2,$t2
288                 srlx    $acc14,16,$acc14
289                 xor     $acc10,$t2,$t2
290                 srlx    $acc15,24,$acc15        !
291                 xor     $acc11,$t2,$t2
292                 xor     $acc12,$acc14,$acc14
293                 xor     $acc13,$t3,$t3
294         srl     $t0,21,$acc0
295                 xor     $acc14,$t3,$t3
296         srl     $t1,13,$acc1
297                 xor     $acc15,$t3,$t3
298
299         and     $acc0,2040,$acc0                !
300         srl     $t2,5,$acc2
301         and     $acc1,2040,$acc1
302         ldx     [$tbl+$acc0],$acc0
303         sll     $t3,3,$acc3
304         and     $acc2,2040,$acc2
305         ldx     [$tbl+$acc1],$acc1
306         fmovs   %f0,%f0
307         srl     $t1,21,$acc4                    !
308         and     $acc3,2040,$acc3
309         ldx     [$tbl+$acc2],$acc2
310         srl     $t2,13,$acc5
311         and     $acc4,2040,$acc4
312         ldx     [$tbl+$acc3],$acc3
313         srl     $t3,5,$acc6
314         and     $acc5,2040,$acc5
315         ldx     [$tbl+$acc4],$acc4              !
316         sll     $t0,3,$acc7
317         and     $acc6,2040,$acc6
318         ldx     [$tbl+$acc5],$acc5
319         srl     $t2,21,$acc8
320         and     $acc7,2040,$acc7
321         ldx     [$tbl+$acc6],$acc6
322         fmovs   %f0,%f0
323         srl     $t3,13,$acc9                    !
324         and     $acc8,2040,$acc8
325         ldx     [$tbl+$acc7],$acc7
326         srl     $t0,5,$acc10
327         and     $acc9,2040,$acc9
328         ldx     [$tbl+$acc8],$acc8
329         sll     $t1,3,$acc11
330         and     $acc10,2040,$acc10
331         ldx     [$tbl+$acc9],$acc9              !
332         srl     $t3,21,$acc12
333         and     $acc11,2040,$acc11
334         ldx     [$tbl+$acc10],$acc10
335         srl     $t0,13,$acc13
336         and     $acc12,2040,$acc12
337         ldx     [$tbl+$acc11],$acc11
338         fmovs   %f0,%f0
339         srl     $t1,5,$acc14                    !
340         and     $acc13,2040,$acc13
341         ldx     [$tbl+$acc12],$acc12
342         sll     $t2,3,$acc15
343         and     $acc14,2040,$acc14
344         ldx     [$tbl+$acc13],$acc13
345                 srlx    $acc1,8,$acc1
346         and     $acc15,2040,$acc15
347         ldx     [$tbl+$acc14],$acc14            !
348
349                 srlx    $acc2,16,$acc2
350                 xor     $acc0,$s0,$s0
351         ldx     [$tbl+$acc15],$acc15
352                 srlx    $acc3,24,$acc3
353                 xor     $acc1,$s0,$s0
354         ld      [$key+16],$t0
355         fmovs   %f0,%f0
356                 srlx    $acc5,8,$acc5           !
357                 xor     $acc2,$s0,$s0
358         ld      [$key+20],$t1
359                 srlx    $acc6,16,$acc6
360                 xor     $acc3,$s0,$s0
361         ld      [$key+24],$t2
362                 srlx    $acc7,24,$acc7
363                 xor     $acc4,$s1,$s1
364         ld      [$key+28],$t3                   !
365                 srlx    $acc9,8,$acc9
366                 xor     $acc5,$s1,$s1
367                 srlx    $acc10,16,$acc10
368                 xor     $acc6,$s1,$s1
369                 srlx    $acc11,24,$acc11
370                 xor     $acc7,$s1,$s1
371                 srlx    $acc13,8,$acc13
372                 xor     $acc8,$s2,$s2
373                 srlx    $acc14,16,$acc14        !
374                 xor     $acc9,$s2,$s2
375                 srlx    $acc15,24,$acc15
376                 xor     $acc10,$s2,$s2
377         srl     $s0,21,$acc0
378                 xor     $acc11,$s2,$s2
379                 xor     $acc12,$acc14,$acc14
380                 xor     $acc13,$s3,$s3
381         srl     $s1,13,$acc1                    !
382                 xor     $acc14,$s3,$s3
383                 xor     $acc15,$s3,$s3
384         ba      .Lenc_loop
385         and     $acc0,2040,$acc0
386
387 .align  32
388 .Lenc_last:
389                 srlx    $acc1,8,$acc1           !
390                 xor     $acc0,$t0,$t0
391         ld      [$key+0],$s0
392                 srlx    $acc2,16,$acc2
393                 xor     $acc1,$t0,$t0
394         ld      [$key+4],$s1
395                 srlx    $acc3,24,$acc3
396                 xor     $acc2,$t0,$t0
397         ld      [$key+8],$s2                    !
398                 srlx    $acc5,8,$acc5
399                 xor     $acc3,$t0,$t0
400         ld      [$key+12],$s3
401                 srlx    $acc6,16,$acc6
402                 xor     $acc4,$t1,$t1
403                 srlx    $acc7,24,$acc7
404                 xor     $acc5,$t1,$t1
405                 srlx    $acc9,8,$acc9           !
406                 xor     $acc6,$t1,$t1
407                 srlx    $acc10,16,$acc10
408                 xor     $acc7,$t1,$t1
409                 srlx    $acc11,24,$acc11
410                 xor     $acc8,$t2,$t2
411                 srlx    $acc13,8,$acc13
412                 xor     $acc9,$t2,$t2
413                 srlx    $acc14,16,$acc14        !
414                 xor     $acc10,$t2,$t2
415                 srlx    $acc15,24,$acc15
416                 xor     $acc11,$t2,$t2
417                 xor     $acc12,$acc14,$acc14
418                 xor     $acc13,$t3,$t3
419         srl     $t0,24,$acc0
420                 xor     $acc14,$t3,$t3
421         srl     $t1,16,$acc1                    !
422                 xor     $acc15,$t3,$t3
423
424         srl     $t2,8,$acc2
425         and     $acc1,255,$acc1
426         ldub    [$rounds+$acc0],$acc0
427         srl     $t1,24,$acc4
428         and     $acc2,255,$acc2
429         ldub    [$rounds+$acc1],$acc1
430         srl     $t2,16,$acc5                    !
431         and     $t3,255,$acc3
432         ldub    [$rounds+$acc2],$acc2
433         ldub    [$rounds+$acc3],$acc3
434         srl     $t3,8,$acc6
435         and     $acc5,255,$acc5
436         ldub    [$rounds+$acc4],$acc4
437         fmovs   %f0,%f0
438         srl     $t2,24,$acc8                    !
439         and     $acc6,255,$acc6
440         ldub    [$rounds+$acc5],$acc5
441         srl     $t3,16,$acc9
442         and     $t0,255,$acc7
443         ldub    [$rounds+$acc6],$acc6
444         ldub    [$rounds+$acc7],$acc7
445         fmovs   %f0,%f0
446         srl     $t0,8,$acc10                    !
447         and     $acc9,255,$acc9
448         ldub    [$rounds+$acc8],$acc8
449         srl     $t3,24,$acc12
450         and     $acc10,255,$acc10
451         ldub    [$rounds+$acc9],$acc9
452         srl     $t0,16,$acc13
453         and     $t1,255,$acc11
454         ldub    [$rounds+$acc10],$acc10         !
455         srl     $t1,8,$acc14
456         and     $acc13,255,$acc13
457         ldub    [$rounds+$acc11],$acc11
458         ldub    [$rounds+$acc12],$acc12
459         and     $acc14,255,$acc14
460         ldub    [$rounds+$acc13],$acc13
461         and     $t2,255,$acc15
462         ldub    [$rounds+$acc14],$acc14         !
463
464                 sll     $acc0,24,$acc0
465                 xor     $acc3,$s0,$s0
466         ldub    [$rounds+$acc15],$acc15
467                 sll     $acc1,16,$acc1
468                 xor     $acc0,$s0,$s0
469         ldx     [%sp+$bias+$frame+0],%i7        ! restore return address
470         fmovs   %f0,%f0
471                 sll     $acc2,8,$acc2           !
472                 xor     $acc1,$s0,$s0
473                 sll     $acc4,24,$acc4
474                 xor     $acc2,$s0,$s0
475                 sll     $acc5,16,$acc5
476                 xor     $acc7,$s1,$s1
477                 sll     $acc6,8,$acc6
478                 xor     $acc4,$s1,$s1
479                 sll     $acc8,24,$acc8          !
480                 xor     $acc5,$s1,$s1
481                 sll     $acc9,16,$acc9
482                 xor     $acc11,$s2,$s2
483                 sll     $acc10,8,$acc10
484                 xor     $acc6,$s1,$s1
485                 sll     $acc12,24,$acc12
486                 xor     $acc8,$s2,$s2
487                 sll     $acc13,16,$acc13        !
488                 xor     $acc9,$s2,$s2
489                 sll     $acc14,8,$acc14
490                 xor     $acc10,$s2,$s2
491                 xor     $acc12,$acc14,$acc14
492                 xor     $acc13,$s3,$s3
493                 xor     $acc14,$s3,$s3
494                 xor     $acc15,$s3,$s3
495
496         ret
497         restore
498 .type   _sparcv9_AES_encrypt,#function
499 .size   _sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt)
500
501 .align  32
502 .globl  AES_encrypt
503 AES_encrypt:
504         or      %o0,%o1,%g1
505         andcc   %g1,3,%g0
506         bnz,pn  %xcc,.Lunaligned_enc
507         save    %sp,-$frame,%sp
508
509         ld      [%i0+0],%o0
510         ld      [%i0+4],%o1
511         ld      [%i0+8],%o2
512         ld      [%i0+12],%o3
513
514         mov     %i2,%o5
515         nop
516 1:      call    _sparcv9_AES_encrypt
517         sub     %o7,1b-AES_Te,%o4
518
519         st      %o0,[%i1+0]
520         st      %o1,[%i1+4]
521         st      %o2,[%i1+8]
522         st      %o3,[%i1+12]
523
524         ret
525         restore
526
527 .align  32
528 .Lunaligned_enc:
529         ldub    [%i0+0],%l0
530         ldub    [%i0+1],%l1
531         ldub    [%i0+2],%l2
532
533         sll     %l0,24,%l0
534         ldub    [%i0+3],%l3
535         sll     %l1,16,%l1
536         ldub    [%i0+4],%l4
537         sll     %l2,8,%l2
538         or      %l1,%l0,%l0
539         ldub    [%i0+5],%l5
540         sll     %l4,24,%l4
541         or      %l3,%l2,%l2
542         ldub    [%i0+6],%l6
543         sll     %l5,16,%l5
544         or      %l0,%l2,%o0
545         ldub    [%i0+7],%l7
546
547         sll     %l6,8,%l6
548         or      %l5,%l4,%l4
549         ldub    [%i0+8],%l0
550         or      %l7,%l6,%l6
551         ldub    [%i0+9],%l1
552         or      %l4,%l6,%o1
553         ldub    [%i0+10],%l2
554
555         sll     %l0,24,%l0
556         ldub    [%i0+11],%l3
557         sll     %l1,16,%l1
558         ldub    [%i0+12],%l4
559         sll     %l2,8,%l2
560         or      %l1,%l0,%l0
561         ldub    [%i0+13],%l5
562         sll     %l4,24,%l4
563         or      %l3,%l2,%l2
564         ldub    [%i0+14],%l6
565         sll     %l5,16,%l5
566         or      %l0,%l2,%o2
567         ldub    [%i0+15],%l7
568
569         sll     %l6,8,%l6
570         or      %l5,%l4,%l4
571         or      %l7,%l6,%l6
572         or      %l4,%l6,%o3
573
574         mov     %i2,%o5
575         nop
576 1:      call    _sparcv9_AES_encrypt
577         sub     %o7,1b-AES_Te,%o4
578
579         srl     %o0,24,%l0
580         srl     %o0,16,%l1
581         stb     %l0,[%i1+0]
582         srl     %o0,8,%l2
583         stb     %l1,[%i1+1]
584         stb     %l2,[%i1+2]
585         srl     %o1,24,%l4
586         stb     %o0,[%i1+3]
587
588         srl     %o1,16,%l5
589         stb     %l4,[%i1+4]
590         srl     %o1,8,%l6
591         stb     %l5,[%i1+5]
592         stb     %l6,[%i1+6]
593         srl     %o2,24,%l0
594         stb     %o1,[%i1+7]
595
596         srl     %o2,16,%l1
597         stb     %l0,[%i1+8]
598         srl     %o2,8,%l2
599         stb     %l1,[%i1+9]
600         stb     %l2,[%i1+10]
601         srl     %o3,24,%l4
602         stb     %o2,[%i1+11]
603
604         srl     %o3,16,%l5
605         stb     %l4,[%i1+12]
606         srl     %o3,8,%l6
607         stb     %l5,[%i1+13]
608         stb     %l6,[%i1+14]
609         stb     %o3,[%i1+15]
610
611         ret
612         restore
613 .type   AES_encrypt,#function
614 .size   AES_encrypt,(.-AES_encrypt)
615
616 ___
617
# Open the 64-byte-aligned decryption table AES_Td; its word entries are
# emitted by the _data_word call that follows.
618 $code.=<<___;
619 .align  64
620 AES_Td:
621 ___
# Decryption lookup table: 256 words, each written twice by _data_word
# (8 bytes per entry, 2KB in total), mirroring the layout of AES_Te.
622 &_data_word(
623         0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
624         0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
625         0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
626         0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
627         0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
628         0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
629         0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
630         0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
631         0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
632         0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
633         0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
634         0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
635         0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
636         0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
637         0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
638         0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
639         0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
640         0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
641         0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
642         0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
643         0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
644         0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
645         0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
646         0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
647         0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
648         0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
649         0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
650         0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
651         0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
652         0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
653         0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
654         0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
655         0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
656         0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
657         0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
658         0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
659         0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
660         0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
661         0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
662         0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
663         0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
664         0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
665         0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
666         0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
667         0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
668         0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
669         0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
670         0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
671         0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
672         0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
673         0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
674         0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
675         0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
676         0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
677         0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
678         0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
679         0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
680         0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
681         0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
682         0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
683         0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
684         0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
685         0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
686         0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
687 $code.=<<___;
688         .byte   0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
689         .byte   0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
690         .byte   0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
691         .byte   0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
692         .byte   0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
693         .byte   0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
694         .byte   0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
695         .byte   0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
696         .byte   0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
697         .byte   0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
698         .byte   0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
699         .byte   0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
700         .byte   0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
701         .byte   0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
702         .byte   0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
703         .byte   0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
704         .byte   0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
705         .byte   0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
706         .byte   0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
707         .byte   0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
708         .byte   0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
709         .byte   0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
710         .byte   0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
711         .byte   0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
712         .byte   0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
713         .byte   0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
714         .byte   0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
715         .byte   0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
716         .byte   0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
717         .byte   0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
718         .byte   0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
719         .byte   0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
720 .type   AES_Td,#object
721 .size   AES_Td,(.-AES_Td)
722
! ----------------------------------------------------------------------
! _sparcv9_AES_decrypt -- internal single-block decryption core.
!
! Reached via "call" from AES_decrypt below. As visible at those call
! sites, the caller places the four 32-bit state words in its %o0-%o3,
! the address of AES_Td in %o4 and the key schedule pointer in %o5, and
! picks the four result words up from %o0-%o3 after the return. The
! symbolic register names used in the body are Perl variables that are
! assigned near the top of this file.
!
! Structure:
!   1. xor the first round key (key+0..12) into the state and preload the
!      next one (key+16..28);
!   2. .Ldec_loop: two table-driven rounds per pass, 32 bytes of key
!      schedule consumed per pass;
!   3. .Ldec_last: final round using single-byte lookups from the table
!      that starts 2048 bytes past the table base.
!
! Table indexing used throughout: a state byte is converted straight into
! the byte offset byte*8 (mask 2040 = 255*8) so that it addresses 8-byte
! entries; "srl 21", "srl 13", "srl 5" and "sll 3" followed by the mask
! select bytes 3, 2, 1 and 0 of a word respectively.
!
! NOTE(review): the bare trailing "!" comments appear to be the author's
! instruction-group markers for the hand scheduling -- confirm before
! reordering any instruction.
! ----------------------------------------------------------------------
723 .align  64
724 .skip   16
725 _sparcv9_AES_decrypt:
726         save    %sp,-$frame-$locals,%sp
727         stx     %i7,[%sp+$bias+$frame+0]        ! off-load return address
! The word at key+240 initialises the loop counter; it is halved below
! because each pass of the loop performs two rounds.
728         ld      [$key+240],$rounds
729         ld      [$key+0],$t0
730         ld      [$key+4],$t1                    !
731         ld      [$key+8],$t2
732         ld      [$key+12],$t3
733         srl     $rounds,1,$rounds
734         xor     $t0,$s0,$s0
735         ld      [$key+16],$t0
736         xor     $t1,$s1,$s1
737         ld      [$key+20],$t1
738         srl     $s0,21,$acc0                    !
739         xor     $t2,$s2,$s2
740         ld      [$key+24],$t2
741         xor     $t3,$s3,$s3
742         and     $acc0,2040,$acc0
743         ld      [$key+28],$t3
744         srl     $s3,13,$acc1
745         nop
! First round of the pass: sixteen 64-bit table loads indexed by the bytes
! of the s-words. The first two offsets were already started before the
! loop entry (and again in the branch delay slot at the loop bottom).
746 .Ldec_loop:
747         srl     $s2,5,$acc2                     !
748         and     $acc1,2040,$acc1
749         ldx     [$tbl+$acc0],$acc0
750         sll     $s1,3,$acc3
751         and     $acc2,2040,$acc2
752         ldx     [$tbl+$acc1],$acc1
753         srl     $s1,21,$acc4
754         and     $acc3,2040,$acc3
755         ldx     [$tbl+$acc2],$acc2              !
756         srl     $s0,13,$acc5
757         and     $acc4,2040,$acc4
758         ldx     [$tbl+$acc3],$acc3
759         srl     $s3,5,$acc6
760         and     $acc5,2040,$acc5
761         ldx     [$tbl+$acc4],$acc4
762         fmovs   %f0,%f0
763         sll     $s2,3,$acc7                     !
764         and     $acc6,2040,$acc6
765         ldx     [$tbl+$acc5],$acc5
766         srl     $s2,21,$acc8
767         and     $acc7,2040,$acc7
768         ldx     [$tbl+$acc6],$acc6
769         srl     $s1,13,$acc9
770         and     $acc8,2040,$acc8
771         ldx     [$tbl+$acc7],$acc7              !
772         srl     $s0,5,$acc10
773         and     $acc9,2040,$acc9
774         ldx     [$tbl+$acc8],$acc8
775         sll     $s3,3,$acc11
776         and     $acc10,2040,$acc10
777         ldx     [$tbl+$acc9],$acc9
778         fmovs   %f0,%f0
779         srl     $s3,21,$acc12                   !
780         and     $acc11,2040,$acc11
781         ldx     [$tbl+$acc10],$acc10
782         srl     $s2,13,$acc13
783         and     $acc12,2040,$acc12
784         ldx     [$tbl+$acc11],$acc11
785         srl     $s1,5,$acc14
786         and     $acc13,2040,$acc13
787         ldx     [$tbl+$acc12],$acc12            !
788         sll     $s0,3,$acc15
789         and     $acc14,2040,$acc14
790         ldx     [$tbl+$acc13],$acc13
791         and     $acc15,2040,$acc15
! Advance the key pointer by two round keys (32 bytes) for this pass.
792         add     $key,32,$key
793         ldx     [$tbl+$acc14],$acc14
794         fmovs   %f0,%f0
795         subcc   $rounds,1,$rounds               !
796         ldx     [$tbl+$acc15],$acc15
! Annulled branch: the delay-slot add executes only when the branch is
! taken, so the counter register is replaced by table+2048 (base of the
! byte table used by the final round) only on the way to .Ldec_last.
797         bz,a,pn %icc,.Ldec_last
798         add     $tbl,2048,$rounds
799
! Combine step: the lookups for byte positions 2, 1 and 0 are shifted
! right by 8, 16 and 24 bits and everything is xored into t0-t3, which
! already hold the round key. Meanwhile s0-s3 are reloaded with the next
! round key (key+0..12 relative to the advanced pointer).
! NOTE(review): the shifts presumably rely on the layout of the 8-byte
! AES_Td entries -- confirm against the table definition.
800                 srlx    $acc1,8,$acc1
801                 xor     $acc0,$t0,$t0
802         ld      [$key+0],$s0
803         fmovs   %f0,%f0
804                 srlx    $acc2,16,$acc2          !
805                 xor     $acc1,$t0,$t0
806         ld      [$key+4],$s1
807                 srlx    $acc3,24,$acc3
808                 xor     $acc2,$t0,$t0
809         ld      [$key+8],$s2
810                 srlx    $acc5,8,$acc5
811                 xor     $acc3,$t0,$t0
812         ld      [$key+12],$s3                   !
813                 srlx    $acc6,16,$acc6
814                 xor     $acc4,$t1,$t1
815         fmovs   %f0,%f0
816                 srlx    $acc7,24,$acc7
817                 xor     $acc5,$t1,$t1
818                 srlx    $acc9,8,$acc9
819                 xor     $acc6,$t1,$t1
820                 srlx    $acc10,16,$acc10        !
821                 xor     $acc7,$t1,$t1
822                 srlx    $acc11,24,$acc11
823                 xor     $acc8,$t2,$t2
824                 srlx    $acc13,8,$acc13
825                 xor     $acc9,$t2,$t2
826                 srlx    $acc14,16,$acc14
827                 xor     $acc10,$t2,$t2
828                 srlx    $acc15,24,$acc15        !
829                 xor     $acc11,$t2,$t2
830                 xor     $acc12,$acc14,$acc14
831                 xor     $acc13,$t3,$t3
832         srl     $t0,21,$acc0
833                 xor     $acc14,$t3,$t3
834                 xor     $acc15,$t3,$t3
835         srl     $t3,13,$acc1
836
! Second round of the pass: same lookup pattern, now indexed by the bytes
! of t0-t3 and accumulated into s0-s3.
837         and     $acc0,2040,$acc0                !
838         srl     $t2,5,$acc2
839         and     $acc1,2040,$acc1
840         ldx     [$tbl+$acc0],$acc0
841         sll     $t1,3,$acc3
842         and     $acc2,2040,$acc2
843         ldx     [$tbl+$acc1],$acc1
844         fmovs   %f0,%f0
845         srl     $t1,21,$acc4                    !
846         and     $acc3,2040,$acc3
847         ldx     [$tbl+$acc2],$acc2
848         srl     $t0,13,$acc5
849         and     $acc4,2040,$acc4
850         ldx     [$tbl+$acc3],$acc3
851         srl     $t3,5,$acc6
852         and     $acc5,2040,$acc5
853         ldx     [$tbl+$acc4],$acc4              !
854         sll     $t2,3,$acc7
855         and     $acc6,2040,$acc6
856         ldx     [$tbl+$acc5],$acc5
857         srl     $t2,21,$acc8
858         and     $acc7,2040,$acc7
859         ldx     [$tbl+$acc6],$acc6
860         fmovs   %f0,%f0
861         srl     $t1,13,$acc9                    !
862         and     $acc8,2040,$acc8
863         ldx     [$tbl+$acc7],$acc7
864         srl     $t0,5,$acc10
865         and     $acc9,2040,$acc9
866         ldx     [$tbl+$acc8],$acc8
867         sll     $t3,3,$acc11
868         and     $acc10,2040,$acc10
869         ldx     [$tbl+$acc9],$acc9              !
870         srl     $t3,21,$acc12
871         and     $acc11,2040,$acc11
872         ldx     [$tbl+$acc10],$acc10
873         srl     $t2,13,$acc13
874         and     $acc12,2040,$acc12
875         ldx     [$tbl+$acc11],$acc11
876         fmovs   %f0,%f0
877         srl     $t1,5,$acc14                    !
878         and     $acc13,2040,$acc13
879         ldx     [$tbl+$acc12],$acc12
880         sll     $t0,3,$acc15
881         and     $acc14,2040,$acc14
882         ldx     [$tbl+$acc13],$acc13
883                 srlx    $acc1,8,$acc1
884         and     $acc15,2040,$acc15
885         ldx     [$tbl+$acc14],$acc14            !
886
! Combine into s0-s3 and, interleaved with it, preload t0-t3 with the
! round key for the first half of the next pass (key+16..28).
887                 srlx    $acc2,16,$acc2
888                 xor     $acc0,$s0,$s0
889         ldx     [$tbl+$acc15],$acc15
890                 srlx    $acc3,24,$acc3
891                 xor     $acc1,$s0,$s0
892         ld      [$key+16],$t0
893         fmovs   %f0,%f0
894                 srlx    $acc5,8,$acc5           !
895                 xor     $acc2,$s0,$s0
896         ld      [$key+20],$t1
897                 srlx    $acc6,16,$acc6
898                 xor     $acc3,$s0,$s0
899         ld      [$key+24],$t2
900                 srlx    $acc7,24,$acc7
901                 xor     $acc4,$s1,$s1
902         ld      [$key+28],$t3                   !
903                 srlx    $acc9,8,$acc9
904                 xor     $acc5,$s1,$s1
905                 srlx    $acc10,16,$acc10
906                 xor     $acc6,$s1,$s1
907                 srlx    $acc11,24,$acc11
908                 xor     $acc7,$s1,$s1
909                 srlx    $acc13,8,$acc13
910                 xor     $acc8,$s2,$s2
911                 srlx    $acc14,16,$acc14        !
912                 xor     $acc9,$s2,$s2
913                 srlx    $acc15,24,$acc15
914                 xor     $acc10,$s2,$s2
915         srl     $s0,21,$acc0
916                 xor     $acc11,$s2,$s2
917                 xor     $acc12,$acc14,$acc14
918                 xor     $acc13,$s3,$s3
919         and     $acc0,2040,$acc0                !
920                 xor     $acc14,$s3,$s3
921                 xor     $acc15,$s3,$s3
! Loop back; the delay slot already starts the second table offset for
! the next pass.
922         ba      .Ldec_loop
923         srl     $s3,13,$acc1
924
! Final round. First finish the pending round exactly as in the loop
! (combine into t0-t3 while s0-s3 are loaded with the last round key).
925 .align  32
926 .Ldec_last:
927                 srlx    $acc1,8,$acc1           !
928                 xor     $acc0,$t0,$t0
929         ld      [$key+0],$s0
930                 srlx    $acc2,16,$acc2
931                 xor     $acc1,$t0,$t0
932         ld      [$key+4],$s1
933                 srlx    $acc3,24,$acc3
934                 xor     $acc2,$t0,$t0
935         ld      [$key+8],$s2                    !
936                 srlx    $acc5,8,$acc5
937                 xor     $acc3,$t0,$t0
938         ld      [$key+12],$s3
939                 srlx    $acc6,16,$acc6
940                 xor     $acc4,$t1,$t1
941                 srlx    $acc7,24,$acc7
942                 xor     $acc5,$t1,$t1
943                 srlx    $acc9,8,$acc9           !
944                 xor     $acc6,$t1,$t1
945                 srlx    $acc10,16,$acc10
946                 xor     $acc7,$t1,$t1
947                 srlx    $acc11,24,$acc11
948                 xor     $acc8,$t2,$t2
949                 srlx    $acc13,8,$acc13
950                 xor     $acc9,$t2,$t2
951                 srlx    $acc14,16,$acc14        !
952                 xor     $acc10,$t2,$t2
953                 srlx    $acc15,24,$acc15
954                 xor     $acc11,$t2,$t2
955                 xor     $acc12,$acc14,$acc14
956                 xor     $acc13,$t3,$t3
957         srl     $t0,24,$acc0
958                 xor     $acc14,$t3,$t3
959                 xor     $acc15,$t3,$t3          !
960         srl     $t3,16,$acc1
961
! Byte lookups: plain byte indices (shift by 24/16/8 or none, mask 255)
! into the byte table whose base was placed in the counter register by
! the annulled delay slot of the branch that led here.
962         srl     $t2,8,$acc2
963         and     $acc1,255,$acc1
964         ldub    [$rounds+$acc0],$acc0
965         srl     $t1,24,$acc4
966         and     $acc2,255,$acc2
967         ldub    [$rounds+$acc1],$acc1
968         srl     $t0,16,$acc5                    !
969         and     $t1,255,$acc3
970         ldub    [$rounds+$acc2],$acc2
971         ldub    [$rounds+$acc3],$acc3
972         srl     $t3,8,$acc6
973         and     $acc5,255,$acc5
974         ldub    [$rounds+$acc4],$acc4
975         fmovs   %f0,%f0
976         srl     $t2,24,$acc8                    !
977         and     $acc6,255,$acc6
978         ldub    [$rounds+$acc5],$acc5
979         srl     $t1,16,$acc9
980         and     $t2,255,$acc7
981         ldub    [$rounds+$acc6],$acc6
982         ldub    [$rounds+$acc7],$acc7
983         fmovs   %f0,%f0
984         srl     $t0,8,$acc10                    !
985         and     $acc9,255,$acc9
986         ldub    [$rounds+$acc8],$acc8
987         srl     $t3,24,$acc12
988         and     $acc10,255,$acc10
989         ldub    [$rounds+$acc9],$acc9
990         srl     $t2,16,$acc13
991         and     $t3,255,$acc11
992         ldub    [$rounds+$acc10],$acc10         !
993         srl     $t1,8,$acc14
994         and     $acc13,255,$acc13
995         ldub    [$rounds+$acc11],$acc11
996         ldub    [$rounds+$acc12],$acc12
997         and     $acc14,255,$acc14
998         ldub    [$rounds+$acc13],$acc13
999         and     $t0,255,$acc15
1000         ldub    [$rounds+$acc14],$acc14         !
1001
! Reassemble each output word as b3<<24 ^ b2<<16 ^ b1<<8 ^ b0 on top of
! the last round key already sitting in s0-s3. The saved %i7 is reloaded
! only after the last load that uses the byte-table pointer has issued.
1002                 sll     $acc0,24,$acc0
1003                 xor     $acc3,$s0,$s0
1004         ldub    [$rounds+$acc15],$acc15
1005                 sll     $acc1,16,$acc1
1006                 xor     $acc0,$s0,$s0
1007         ldx     [%sp+$bias+$frame+0],%i7        ! restore return address
1008         fmovs   %f0,%f0
1009                 sll     $acc2,8,$acc2           !
1010                 xor     $acc1,$s0,$s0
1011                 sll     $acc4,24,$acc4
1012                 xor     $acc2,$s0,$s0
1013                 sll     $acc5,16,$acc5
1014                 xor     $acc7,$s1,$s1
1015                 sll     $acc6,8,$acc6
1016                 xor     $acc4,$s1,$s1
1017                 sll     $acc8,24,$acc8          !
1018                 xor     $acc5,$s1,$s1
1019                 sll     $acc9,16,$acc9
1020                 xor     $acc11,$s2,$s2
1021                 sll     $acc10,8,$acc10
1022                 xor     $acc6,$s1,$s1
1023                 sll     $acc12,24,$acc12
1024                 xor     $acc8,$s2,$s2
1025                 sll     $acc13,16,$acc13        !
1026                 xor     $acc9,$s2,$s2
1027                 sll     $acc14,8,$acc14
1028                 xor     $acc10,$s2,$s2
1029                 xor     $acc12,$acc14,$acc14
1030                 xor     $acc13,$s3,$s3
1031                 xor     $acc14,$s3,$s3
1032                 xor     $acc15,$s3,$s3
1033
! Return; the restore in the delay slot switches back to the caller's
! register window.
1034         ret
1035         restore
1036 .type   _sparcv9_AES_decrypt,#function
1037 .size   _sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt)
1038
! ----------------------------------------------------------------------
! AES_decrypt -- exported entry point: decrypt one 16-byte block.
!
! Arguments as used by the code below:
!   %o0  pointer to the 16-byte input block
!   %o1  pointer to the 16-byte output block
!   %o2  pointer to the key schedule (handed to the core in %o5)
!
! If both data pointers are 4-byte aligned the block is moved with word
! loads and stores; otherwise .Lunaligned_dec moves it one byte at a time,
! most significant byte of each word first. Either way the four state
! words go to _sparcv9_AES_decrypt in %o0-%o3 and come back in the same
! registers.
! ----------------------------------------------------------------------
1039 .align  32
1040 .globl  AES_decrypt
1041 AES_decrypt:
! Alignment test on (input | output); the save in the delay slot is not
! annulled, so the new register window is set up on both paths.
1042         or      %o0,%o1,%g1
1043         andcc   %g1,3,%g0
1044         bnz,pn  %xcc,.Lunaligned_dec
1045         save    %sp,-$frame,%sp
1046
1047         ld      [%i0+0],%o0
1048         ld      [%i0+4],%o1
1049         ld      [%i0+8],%o2
1050         ld      [%i0+12],%o3
1051
1052         mov     %i2,%o5
1053         nop
! call leaves its own address (label 1) in %o7; the delay slot subtracts
! the assemble-time distance 1b-AES_Td from it, giving the run-time
! address of AES_Td in %o4 without any absolute relocation.
1054 1:      call    _sparcv9_AES_decrypt
1055         sub     %o7,1b-AES_Td,%o4
1056
1057         st      %o0,[%i1+0]
1058         st      %o1,[%i1+4]
1059         st      %o2,[%i1+8]
1060         st      %o3,[%i1+12]
1061
1062         ret
1063         restore
1064
! Byte-wise path: build each 32-bit word as b0<<24 | b1<<16 | b2<<8 | b3
! from four consecutive input bytes, with the loads for the next bytes
! interleaved between the shifts and ors.
1065 .align  32
1066 .Lunaligned_dec:
1067         ldub    [%i0+0],%l0
1068         ldub    [%i0+1],%l1
1069         ldub    [%i0+2],%l2
1070
1071         sll     %l0,24,%l0
1072         ldub    [%i0+3],%l3
1073         sll     %l1,16,%l1
1074         ldub    [%i0+4],%l4
1075         sll     %l2,8,%l2
1076         or      %l1,%l0,%l0
1077         ldub    [%i0+5],%l5
1078         sll     %l4,24,%l4
1079         or      %l3,%l2,%l2
1080         ldub    [%i0+6],%l6
1081         sll     %l5,16,%l5
1082         or      %l0,%l2,%o0
1083         ldub    [%i0+7],%l7
1084
1085         sll     %l6,8,%l6
1086         or      %l5,%l4,%l4
1087         ldub    [%i0+8],%l0
1088         or      %l7,%l6,%l6
1089         ldub    [%i0+9],%l1
1090         or      %l4,%l6,%o1
1091         ldub    [%i0+10],%l2
1092
1093         sll     %l0,24,%l0
1094         ldub    [%i0+11],%l3
1095         sll     %l1,16,%l1
1096         ldub    [%i0+12],%l4
1097         sll     %l2,8,%l2
1098         or      %l1,%l0,%l0
1099         ldub    [%i0+13],%l5
1100         sll     %l4,24,%l4
1101         or      %l3,%l2,%l2
1102         ldub    [%i0+14],%l6
1103         sll     %l5,16,%l5
1104         or      %l0,%l2,%o2
1105         ldub    [%i0+15],%l7
1106
1107         sll     %l6,8,%l6
1108         or      %l5,%l4,%l4
1109         or      %l7,%l6,%l6
1110         or      %l4,%l6,%o3
1111
1112         mov     %i2,%o5
1113         nop
! Same PC-relative table address computation as on the aligned path.
1114 1:      call    _sparcv9_AES_decrypt
1115         sub     %o7,1b-AES_Td,%o4
1116
! Store the four result words byte by byte, most significant byte first;
! stb writes only the low 8 bits, so the lowest byte needs no shift.
1117         srl     %o0,24,%l0
1118         srl     %o0,16,%l1
1119         stb     %l0,[%i1+0]
1120         srl     %o0,8,%l2
1121         stb     %l1,[%i1+1]
1122         stb     %l2,[%i1+2]
1123         srl     %o1,24,%l4
1124         stb     %o0,[%i1+3]
1125
1126         srl     %o1,16,%l5
1127         stb     %l4,[%i1+4]
1128         srl     %o1,8,%l6
1129         stb     %l5,[%i1+5]
1130         stb     %l6,[%i1+6]
1131         srl     %o2,24,%l0
1132         stb     %o1,[%i1+7]
1133
1134         srl     %o2,16,%l1
1135         stb     %l0,[%i1+8]
1136         srl     %o2,8,%l2
1137         stb     %l1,[%i1+9]
1138         stb     %l2,[%i1+10]
1139         srl     %o3,24,%l4
1140         stb     %o2,[%i1+11]
1141
1142         srl     %o3,16,%l5
1143         stb     %l4,[%i1+12]
1144         srl     %o3,8,%l6
1145         stb     %l5,[%i1+13]
1146         stb     %l6,[%i1+14]
1147         stb     %o3,[%i1+15]
1148
1149         ret
1150         restore
1151 .type   AES_decrypt,#function
1152 .size   AES_decrypt,(.-AES_decrypt)
1153 ___
1154
1155 # fmovs instructions substituting for FP nops were originally added
1156 # to meet specific instruction alignment requirements to maximize ILP.
1157 # As UltraSPARC T1, a.k.a. Niagara, has a shared FPU, FP nops can have
1158 # an undesired effect, so just omit them and sacrifice a fraction of
1159 # a percent in performance...
# Strip every FP nop from the generated assembly, from the mnemonic to the
# end of its line (/m makes $ match at each line end, /g hits them all).
# The replacement is a plain empty string, so the /e modifier (evaluate the
# replacement as Perl code) was superfluous and has been dropped.
1160 $code =~ s/fmovs.*$//gm;
1161
# Emit the assembly on STDOUT, then close it explicitly: buffered write
# errors (full disk, closed pipe) only surface at close time, and an
# unnoticed failure would leave a truncated assembly file for the build.
1162 print $code;
close STDOUT or die "error closing STDOUT: $!";