30f38d7a5a2ec060b97513cf16939c3ca146434b
[openssl.git] / crypto / aes / asm / aes-sparcv9.pl
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. Rights for redistribution and usage in source and binary
6 # forms are granted according to the OpenSSL license.
7 # ====================================================================
8 #
9 # Version 1.0
10 #
11 # The major reason for undertaking this effort was to mitigate the hazard
12 # of cache-timing attacks. This is [currently and initially!] addressed in
13 # two ways. 1. S-boxes are compressed from 5KB to 2KB+256B size each.
14 # 2. References to them are scheduled for L2 cache latency, meaning
15 # that the tables don't have to reside in L1 cache. Once again, this
16 # is an initial draft and one should expect more countermeasures to
17 # be implemented...
18 #
19 # Even though performance was not the primary goal [on the contrary,
20 # extra shifts "induced" by compressed S-box and longer loop epilogue
21 # "induced" by scheduling for L2 have negative effect on performance],
22 # the code turned out to run in ~23 cycles per processed byte en-/
23 # decrypted with a 128-bit key. This is a pretty good result for code
24 # with the mentioned qualities on an UltraSPARC core. Compared to Sun C
25 # generated code my encrypt procedure runs just a few percent faster,
26 # while decrypt one - whole 50% faster [yes, Sun C failed to generate
27 # optimal decrypt procedure]. Compared to GNU C generated code both
28 # procedures are more than 60% faster:-)
29
30 # ABI width: 64 when any argument mentions -m64 or -xarch=v9, otherwise 32.
31 $bits=(grep { /\-m64/ || /\-xarch\=v9/ } @ARGV) ? 64 : 32;
32 # $bias is added to %sp when addressing the frame; $frame is the base frame size given to save.
33 ($bias,$frame)=($bits==64) ? (2047,192) : (0,112);
34 $locals=16;     # extra frame bytes; the core parks its return address at offset $frame
35
36 $acc0="%l0";    # $acc0-$acc15: the sixteen table offsets of a round, then the entries loaded from them
37 $acc1="%o0";
38 $acc2="%o1";
39 $acc3="%o2";
40
41 $acc4="%l1";
42 $acc5="%o3";
43 $acc6="%o4";
44 $acc7="%o5";
45
46 $acc8="%l2";
47 $acc9="%o7";
48 $acc10="%g1";
49 $acc11="%g2";
50
51 $acc12="%l3";
52 $acc13="%g3";
53 $acc14="%g4";
54 $acc15="%g5";
55
56 $t0="%l4";      # $t0-$t3: round-key words preloaded with ld, then xor-ed with table entries into the next state
57 $t1="%l5";
58 $t2="%l6";
59 $t3="%l7";
60
61 $s0="%i0";      # $s0-$s3: the four state words; the caller's %o0-%o3 as seen after the core's save
62 $s1="%i1";
63 $s2="%i2";
64 $s3="%i3";
65 $tbl="%i4";     # table base; AES_encrypt computes the address of AES_Te into %o4 before the call
66 $key="%i5";     # key-schedule pointer; AES_encrypt passes it in %o5
67 $rounds="%i7";  # aliases with return address, which is off-loaded to stack
68
69 sub _data_word  # append each 32-bit argument to $code as a duplicated ".long w,w" pair (one 8-byte entry for ldx)
70 { my $i;        # no "()" prototype: the sub takes a variable-length list, so plain and &-style calls both work
71     while(defined($i=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$i,$i; }
72 }
73
74 $code.=<<___ if ($bits==64);    # 64-bit builds only: declare %g2 and %g3 as scratch registers
75 .register       %g2,#scratch
76 .register       %g3,#scratch
77 ___
78 $code.=<<___;                   # open the text section and place the 64-byte-aligned AES_Te label
79 .section        ".text",#alloc,#execinstr
80
81 .align  64
82 AES_Te:
83 ___
84 &_data_word(    # 256 encryption table words; each is emitted twice, giving the 8-byte entries the rounds fetch with ldx
85         0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
86         0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
87         0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
88         0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
89         0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
90         0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
91         0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
92         0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
93         0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
94         0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
95         0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
96         0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
97         0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
98         0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
99         0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
100         0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
101         0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
102         0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
103         0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
104         0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
105         0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
106         0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
107         0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
108         0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
109         0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
110         0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
111         0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
112         0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
113         0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
114         0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
115         0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
116         0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
117         0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
118         0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
119         0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
120         0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
121         0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
122         0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
123         0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
124         0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
125         0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
126         0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
127         0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
128         0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
129         0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
130         0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
131         0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
132         0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
133         0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
134         0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
135         0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
136         0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
137         0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
138         0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
139         0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
140         0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
141         0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
142         0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
143         0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
144         0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
145         0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
146         0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
147         0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
148         0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
149 $code.=<<___;   # 256 table bytes at AES_Te+2048 (read with ldub in the last round), the encrypt core, and the AES_encrypt entry point with aligned and byte-wise I/O paths
150         .byte   0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
151         .byte   0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
152         .byte   0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
153         .byte   0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
154         .byte   0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
155         .byte   0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
156         .byte   0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
157         .byte   0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
158         .byte   0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
159         .byte   0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
160         .byte   0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
161         .byte   0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
162         .byte   0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
163         .byte   0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
164         .byte   0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
165         .byte   0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
166         .byte   0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
167         .byte   0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
168         .byte   0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
169         .byte   0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
170         .byte   0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
171         .byte   0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
172         .byte   0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
173         .byte   0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
174         .byte   0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
175         .byte   0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
176         .byte   0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
177         .byte   0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
178         .byte   0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
179         .byte   0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
180         .byte   0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
181         .byte   0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
182 .type   AES_Te,#object
183 .size   AES_Te,(.-AES_Te)
184
185 .align  64
186 .skip   16
187 _sparcv9_AES_encrypt:
188         save    %sp,-$frame-$locals,%sp
189         stx     %i7,[%sp+$bias+$frame+0]        ! off-load return address
190         ld      [$key+240],$rounds
191         ld      [$key+0],$t0
192         ld      [$key+4],$t1                    !
193         ld      [$key+8],$t2
194         srl     $rounds,1,$rounds
195         xor     $t0,$s0,$s0
196         ld      [$key+12],$t3
197         srl     $s0,21,$acc0
198         xor     $t1,$s1,$s1
199         ld      [$key+16],$t0
200         srl     $s1,13,$acc1                    !
201         xor     $t2,$s2,$s2
202         ld      [$key+20],$t1
203         xor     $t3,$s3,$s3
204         ld      [$key+24],$t2
205         and     $acc0,2040,$acc0
206         ld      [$key+28],$t3
207         nop
208 .Lenc_loop:
209         srl     $s2,5,$acc2                     !
210         and     $acc1,2040,$acc1
211         ldx     [$tbl+$acc0],$acc0
212         sll     $s3,3,$acc3
213         and     $acc2,2040,$acc2
214         ldx     [$tbl+$acc1],$acc1
215         srl     $s1,21,$acc4
216         and     $acc3,2040,$acc3
217         ldx     [$tbl+$acc2],$acc2              !
218         srl     $s2,13,$acc5
219         and     $acc4,2040,$acc4
220         ldx     [$tbl+$acc3],$acc3
221         srl     $s3,5,$acc6
222         and     $acc5,2040,$acc5
223         ldx     [$tbl+$acc4],$acc4
224         fmovs   %f0,%f0
225         sll     $s0,3,$acc7                     !
226         and     $acc6,2040,$acc6
227         ldx     [$tbl+$acc5],$acc5
228         srl     $s2,21,$acc8
229         and     $acc7,2040,$acc7
230         ldx     [$tbl+$acc6],$acc6
231         srl     $s3,13,$acc9
232         and     $acc8,2040,$acc8
233         ldx     [$tbl+$acc7],$acc7              !
234         srl     $s0,5,$acc10
235         and     $acc9,2040,$acc9
236         ldx     [$tbl+$acc8],$acc8
237         sll     $s1,3,$acc11
238         and     $acc10,2040,$acc10
239         ldx     [$tbl+$acc9],$acc9
240         fmovs   %f0,%f0
241         srl     $s3,21,$acc12                   !
242         and     $acc11,2040,$acc11
243         ldx     [$tbl+$acc10],$acc10
244         srl     $s0,13,$acc13
245         and     $acc12,2040,$acc12
246         ldx     [$tbl+$acc11],$acc11
247         srl     $s1,5,$acc14
248         and     $acc13,2040,$acc13
249         ldx     [$tbl+$acc12],$acc12            !
250         sll     $s2,3,$acc15
251         and     $acc14,2040,$acc14
252         ldx     [$tbl+$acc13],$acc13
253         and     $acc15,2040,$acc15
254         add     $key,32,$key
255         ldx     [$tbl+$acc14],$acc14
256         fmovs   %f0,%f0
257         subcc   $rounds,1,$rounds               !
258         ldx     [$tbl+$acc15],$acc15
259         bz,a,pn %icc,.Lenc_last
260         add     $tbl,2048,$rounds
261
262                 srlx    $acc1,8,$acc1
263                 xor     $acc0,$t0,$t0
264         ld      [$key+0],$s0
265         fmovs   %f0,%f0
266                 srlx    $acc2,16,$acc2          !
267                 xor     $acc1,$t0,$t0
268         ld      [$key+4],$s1
269                 srlx    $acc3,24,$acc3
270                 xor     $acc2,$t0,$t0
271         ld      [$key+8],$s2
272                 srlx    $acc5,8,$acc5
273                 xor     $acc3,$t0,$t0
274         ld      [$key+12],$s3                   !
275                 srlx    $acc6,16,$acc6
276                 xor     $acc4,$t1,$t1
277         fmovs   %f0,%f0
278                 srlx    $acc7,24,$acc7
279                 xor     $acc5,$t1,$t1
280                 srlx    $acc9,8,$acc9
281                 xor     $acc6,$t1,$t1
282                 srlx    $acc10,16,$acc10        !
283                 xor     $acc7,$t1,$t1
284                 srlx    $acc11,24,$acc11
285                 xor     $acc8,$t2,$t2
286                 srlx    $acc13,8,$acc13
287                 xor     $acc9,$t2,$t2
288                 srlx    $acc14,16,$acc14
289                 xor     $acc10,$t2,$t2
290                 srlx    $acc15,24,$acc15        !
291                 xor     $acc11,$t2,$t2
292                 xor     $acc12,$acc14,$acc14
293                 xor     $acc13,$t3,$t3
294         srl     $t0,21,$acc0
295                 xor     $acc14,$t3,$t3
296         srl     $t1,13,$acc1
297                 xor     $acc15,$t3,$t3
298
299         and     $acc0,2040,$acc0                !
300         srl     $t2,5,$acc2
301         and     $acc1,2040,$acc1
302         ldx     [$tbl+$acc0],$acc0
303         sll     $t3,3,$acc3
304         and     $acc2,2040,$acc2
305         ldx     [$tbl+$acc1],$acc1
306         fmovs   %f0,%f0
307         srl     $t1,21,$acc4                    !
308         and     $acc3,2040,$acc3
309         ldx     [$tbl+$acc2],$acc2
310         srl     $t2,13,$acc5
311         and     $acc4,2040,$acc4
312         ldx     [$tbl+$acc3],$acc3
313         srl     $t3,5,$acc6
314         and     $acc5,2040,$acc5
315         ldx     [$tbl+$acc4],$acc4              !
316         sll     $t0,3,$acc7
317         and     $acc6,2040,$acc6
318         ldx     [$tbl+$acc5],$acc5
319         srl     $t2,21,$acc8
320         and     $acc7,2040,$acc7
321         ldx     [$tbl+$acc6],$acc6
322         fmovs   %f0,%f0
323         srl     $t3,13,$acc9                    !
324         and     $acc8,2040,$acc8
325         ldx     [$tbl+$acc7],$acc7
326         srl     $t0,5,$acc10
327         and     $acc9,2040,$acc9
328         ldx     [$tbl+$acc8],$acc8
329         sll     $t1,3,$acc11
330         and     $acc10,2040,$acc10
331         ldx     [$tbl+$acc9],$acc9              !
332         srl     $t3,21,$acc12
333         and     $acc11,2040,$acc11
334         ldx     [$tbl+$acc10],$acc10
335         srl     $t0,13,$acc13
336         and     $acc12,2040,$acc12
337         ldx     [$tbl+$acc11],$acc11
338         fmovs   %f0,%f0
339         srl     $t1,5,$acc14                    !
340         and     $acc13,2040,$acc13
341         ldx     [$tbl+$acc12],$acc12
342         sll     $t2,3,$acc15
343         and     $acc14,2040,$acc14
344         ldx     [$tbl+$acc13],$acc13
345                 srlx    $acc1,8,$acc1
346         and     $acc15,2040,$acc15
347         ldx     [$tbl+$acc14],$acc14            !
348
349                 srlx    $acc2,16,$acc2
350                 xor     $acc0,$s0,$s0
351         ldx     [$tbl+$acc15],$acc15
352                 srlx    $acc3,24,$acc3
353                 xor     $acc1,$s0,$s0
354         ld      [$key+16],$t0
355         fmovs   %f0,%f0
356                 srlx    $acc5,8,$acc5           !
357                 xor     $acc2,$s0,$s0
358         ld      [$key+20],$t1
359                 srlx    $acc6,16,$acc6
360                 xor     $acc3,$s0,$s0
361         ld      [$key+24],$t2
362                 srlx    $acc7,24,$acc7
363                 xor     $acc4,$s1,$s1
364         ld      [$key+28],$t3                   !
365                 srlx    $acc9,8,$acc9
366                 xor     $acc5,$s1,$s1
367                 srlx    $acc10,16,$acc10
368                 xor     $acc6,$s1,$s1
369                 srlx    $acc11,24,$acc11
370                 xor     $acc7,$s1,$s1
371                 srlx    $acc13,8,$acc13
372                 xor     $acc8,$s2,$s2
373                 srlx    $acc14,16,$acc14        !
374                 xor     $acc9,$s2,$s2
375                 srlx    $acc15,24,$acc15
376                 xor     $acc10,$s2,$s2
377         srl     $s0,21,$acc0
378                 xor     $acc11,$s2,$s2
379                 xor     $acc12,$acc14,$acc14
380                 xor     $acc13,$s3,$s3
381         srl     $s1,13,$acc1                    !
382                 xor     $acc14,$s3,$s3
383                 xor     $acc15,$s3,$s3
384         ba      .Lenc_loop
385         and     $acc0,2040,$acc0
386
387 .align  32
388 .Lenc_last:
389                 srlx    $acc1,8,$acc1           !
390                 xor     $acc0,$t0,$t0
391         ld      [$key+0],$s0
392                 srlx    $acc2,16,$acc2
393                 xor     $acc1,$t0,$t0
394         ld      [$key+4],$s1
395                 srlx    $acc3,24,$acc3
396                 xor     $acc2,$t0,$t0
397         ld      [$key+8],$s2                    !
398                 srlx    $acc5,8,$acc5
399                 xor     $acc3,$t0,$t0
400         ld      [$key+12],$s3
401                 srlx    $acc6,16,$acc6
402                 xor     $acc4,$t1,$t1
403                 srlx    $acc7,24,$acc7
404                 xor     $acc5,$t1,$t1
405                 srlx    $acc9,8,$acc9           !
406                 xor     $acc6,$t1,$t1
407                 srlx    $acc10,16,$acc10
408                 xor     $acc7,$t1,$t1
409                 srlx    $acc11,24,$acc11
410                 xor     $acc8,$t2,$t2
411                 srlx    $acc13,8,$acc13
412                 xor     $acc9,$t2,$t2
413                 srlx    $acc14,16,$acc14        !
414                 xor     $acc10,$t2,$t2
415                 srlx    $acc15,24,$acc15
416                 xor     $acc11,$t2,$t2
417                 xor     $acc12,$acc14,$acc14
418                 xor     $acc13,$t3,$t3
419         srl     $t0,24,$acc0
420                 xor     $acc14,$t3,$t3
421         srl     $t1,16,$acc1                    !
422                 xor     $acc15,$t3,$t3
423
424         srl     $t2,8,$acc2
425         and     $acc1,255,$acc1
426         ldub    [$rounds+$acc0],$acc0
427         srl     $t1,24,$acc4
428         and     $acc2,255,$acc2
429         ldub    [$rounds+$acc1],$acc1
430         srl     $t2,16,$acc5                    !
431         and     $t3,255,$acc3
432         ldub    [$rounds+$acc2],$acc2
433         ldub    [$rounds+$acc3],$acc3
434         srl     $t3,8,$acc6
435         and     $acc5,255,$acc5
436         ldub    [$rounds+$acc4],$acc4
437         fmovs   %f0,%f0
438         srl     $t2,24,$acc8                    !
439         and     $acc6,255,$acc6
440         ldub    [$rounds+$acc5],$acc5
441         srl     $t3,16,$acc9
442         and     $t0,255,$acc7
443         ldub    [$rounds+$acc6],$acc6
444         ldub    [$rounds+$acc7],$acc7
445         fmovs   %f0,%f0
446         srl     $t0,8,$acc10                    !
447         and     $acc9,255,$acc9
448         ldub    [$rounds+$acc8],$acc8
449         srl     $t3,24,$acc12
450         and     $acc10,255,$acc10
451         ldub    [$rounds+$acc9],$acc9
452         srl     $t0,16,$acc13
453         and     $t1,255,$acc11
454         ldub    [$rounds+$acc10],$acc10         !
455         srl     $t1,8,$acc14
456         and     $acc13,255,$acc13
457         ldub    [$rounds+$acc11],$acc11
458         ldub    [$rounds+$acc12],$acc12
459         and     $acc14,255,$acc14
460         ldub    [$rounds+$acc13],$acc13
461         and     $t2,255,$acc15
462         ldub    [$rounds+$acc14],$acc14         !
463
464                 sll     $acc0,24,$acc0
465                 xor     $acc3,$s0,$s0
466         ldub    [$rounds+$acc15],$acc15
467                 sll     $acc1,16,$acc1
468                 xor     $acc0,$s0,$s0
469         ldx     [%sp+$bias+$frame+0],%i7        ! restore return address
470         fmovs   %f0,%f0
471                 sll     $acc2,8,$acc2           !
472                 xor     $acc1,$s0,$s0
473                 sll     $acc4,24,$acc4
474                 xor     $acc2,$s0,$s0
475                 sll     $acc5,16,$acc5
476                 xor     $acc7,$s1,$s1
477                 sll     $acc6,8,$acc6
478                 xor     $acc4,$s1,$s1
479                 sll     $acc8,24,$acc8          !
480                 xor     $acc5,$s1,$s1
481                 sll     $acc9,16,$acc9
482                 xor     $acc11,$s2,$s2
483                 sll     $acc10,8,$acc10
484                 xor     $acc6,$s1,$s1
485                 sll     $acc12,24,$acc12
486                 xor     $acc8,$s2,$s2
487                 sll     $acc13,16,$acc13        !
488                 xor     $acc9,$s2,$s2
489                 sll     $acc14,8,$acc14
490                 xor     $acc10,$s2,$s2
491                 xor     $acc12,$acc14,$acc14
492                 xor     $acc13,$s3,$s3
493                 xor     $acc14,$s3,$s3
494                 xor     $acc15,$s3,$s3
495
496         ret
497         restore
498 .type   _sparcv9_AES_encrypt,#function
499 .size   _sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt)
500
501 .align  32
502 .globl  AES_encrypt
503 AES_encrypt:
504         or      %o0,%o1,%g1
505         andcc   %g1,3,%g0
506         bnz,pn  %xcc,.Lunaligned_enc
507         save    %sp,-$frame,%sp
508
509         ld      [%i0+0],%o0
510         ld      [%i0+4],%o1
511         ld      [%i0+8],%o2
512         ld      [%i0+12],%o3
513
514 1:      call    .+8
515         mov     %i2,%o5
516         sub     %o7,1b-AES_Te,%o4
517         call    _sparcv9_AES_encrypt
518         nop
519
520         st      %o0,[%i1+0]
521         st      %o1,[%i1+4]
522         st      %o2,[%i1+8]
523         st      %o3,[%i1+12]
524
525         ret
526         restore
527
528 .align  32
529 .Lunaligned_enc:
530         ldub    [%i0+0],%l0
531         ldub    [%i0+1],%l1
532         ldub    [%i0+2],%l2
533
534         sll     %l0,24,%l0
535         ldub    [%i0+3],%l3
536         sll     %l1,16,%l1
537         ldub    [%i0+4],%l4
538         sll     %l2,8,%l2
539         or      %l1,%l0,%l0
540         ldub    [%i0+5],%l5
541         sll     %l4,24,%l4
542         or      %l3,%l2,%l2
543         ldub    [%i0+6],%l6
544         sll     %l5,16,%l5
545         or      %l0,%l2,%o0
546         ldub    [%i0+7],%l7
547
548         sll     %l6,8,%l6
549         or      %l5,%l4,%l4
550         ldub    [%i0+8],%l0
551         or      %l7,%l6,%l6
552         ldub    [%i0+9],%l1
553         or      %l4,%l6,%o1
554         ldub    [%i0+10],%l2
555
556         sll     %l0,24,%l0
557         ldub    [%i0+11],%l3
558         sll     %l1,16,%l1
559         ldub    [%i0+12],%l4
560         sll     %l2,8,%l2
561         or      %l1,%l0,%l0
562         ldub    [%i0+13],%l5
563         sll     %l4,24,%l4
564         or      %l3,%l2,%l2
565         ldub    [%i0+14],%l6
566         sll     %l5,16,%l5
567         or      %l0,%l2,%o2
568         ldub    [%i0+15],%l7
569
570         sll     %l6,8,%l6
571         or      %l5,%l4,%l4
572         or      %l7,%l6,%l6
573         or      %l4,%l6,%o3
574
575 1:      call    .+8
576         mov     %i2,%o5
577         sub     %o7,1b-AES_Te,%o4
578         call    _sparcv9_AES_encrypt
579         nop
580
581         srl     %o0,24,%l0
582         srl     %o0,16,%l1
583         stb     %l0,[%i1+0]
584         srl     %o0,8,%l2
585         stb     %l1,[%i1+1]
586         stb     %l2,[%i1+2]
587         srl     %o1,24,%l4
588         stb     %o0,[%i1+3]
589
590         srl     %o1,16,%l5
591         stb     %l4,[%i1+4]
592         srl     %o1,8,%l6
593         stb     %l5,[%i1+5]
594         stb     %l6,[%i1+6]
595         srl     %o2,24,%l0
596         stb     %o1,[%i1+7]
597
598         srl     %o2,16,%l1
599         stb     %l0,[%i1+8]
600         srl     %o2,8,%l2
601         stb     %l1,[%i1+9]
602         stb     %l2,[%i1+10]
603         srl     %o3,24,%l4
604         stb     %o2,[%i1+11]
605
606         srl     %o3,16,%l5
607         stb     %l4,[%i1+12]
608         srl     %o3,8,%l6
609         stb     %l5,[%i1+13]
610         stb     %l6,[%i1+14]
611         stb     %o3,[%i1+15]
612
613         ret
614         restore
615 .type   AES_encrypt,#function
616 .size   AES_encrypt,(.-AES_encrypt)
617
618 ___
619
620 $code.=<<___;   # place the 64-byte-aligned AES_Td label ahead of the decryption table words
621 .align  64
622 AES_Td:
623 ___
624 &_data_word(    # 256 decryption table words; each is emitted twice, giving the 8-byte entries the rounds fetch with ldx
625         0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
626         0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
627         0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
628         0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
629         0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
630         0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
631         0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
632         0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
633         0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
634         0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
635         0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
636         0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
637         0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
638         0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
639         0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
640         0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
641         0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
642         0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
643         0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
644         0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
645         0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
646         0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
647         0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
648         0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
649         0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
650         0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
651         0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
652         0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
653         0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
654         0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
655         0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
656         0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
657         0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
658         0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
659         0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
660         0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
661         0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
662         0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
663         0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
664         0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
665         0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
666         0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
667         0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
668         0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
669         0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
670         0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
671         0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
672         0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
673         0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
674         0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
675         0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
676         0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
677         0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
678         0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
679         0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
680         0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
681         0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
682         0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
683         0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
684         0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
685         0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
686         0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
687         0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
688         0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
689 $code.=<<___;
690         .byte   0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
691         .byte   0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
692         .byte   0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
693         .byte   0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
694         .byte   0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
695         .byte   0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
696         .byte   0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
697         .byte   0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
698         .byte   0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
699         .byte   0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
700         .byte   0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
701         .byte   0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
702         .byte   0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
703         .byte   0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
704         .byte   0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
705         .byte   0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
706         .byte   0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
707         .byte   0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
708         .byte   0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
709         .byte   0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
710         .byte   0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
711         .byte   0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
712         .byte   0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
713         .byte   0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
714         .byte   0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
715         .byte   0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
716         .byte   0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
717         .byte   0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
718         .byte   0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
719         .byte   0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
720         .byte   0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
721         .byte   0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
722 .type   AES_Td,#object
723 .size   AES_Td,(.-AES_Td)
724
! ---------------------------------------------------------------------
! _sparcv9_AES_decrypt
!
! Core routine: decrypts the single 16-byte block held in s0-s3 and
! leaves the result in s0-s3. It is reached by a call instruction from
! AES_decrypt.
!
! Register names in these comments are the Perl variables of the same
! name without the sigil.
!   s0-s3    block to decrypt, one 32-bit word each (in and out)
!   key      key schedule pointer; the word at offset 240 is read as
!            the round count, round keys are read from offset 0 upwards
!   tbl      base address of AES_Td
!   t0-t3    second copy of the state, used on alternate rounds
!   acc0-15  table indices, then the looked-up values
!   rounds   loop counter; reused as byte-table pointer in the last round
! NOTE(review): AES_decrypt passes the block in its %o0-%o3, the table
! in %o4 and the key in %o5, and reads the result back from %o0-%o3, so
! after the save these are presumably %i0-%i5 - confirm against the
! register assignments at the top of the script.
!
! Table access: each index is a state byte scaled by 8 (shift, then
! mask with 2040 = 255*8), so the first 2048 bytes of the table are 256
! entries of 8 bytes, fetched with 64-bit ldx. The four values that feed
! one output word are used shifted right by 0, 8, 16 and 24 bits.
! NOTE(review): this presumably relies on every entry holding its 32-bit
! word twice, so that the low 32 bits after the shift are a byte
! rotation of that word - confirm against the table data. The last
! round uses byte loads from the 256 bytes that follow at tbl+2048.
!
! Byte selection: output word i takes the top byte of input word i, then
! the next lower bytes of words i-1, i-2 and i-3 (indices modulo 4).
!
! Round structure: the round count is halved and every pass of
! .Ldec_loop performs two rounds, going s0-s3 -> t0-t3 -> s0-s3 and
! stepping key by 32 bytes. When the counter reaches zero the code
! leaves for .Ldec_last, which completes the pending round and then
! does the final round with the byte table.
!
! Layout: .align 64 plus .skip 16 places the entry 16 bytes past a
! 64-byte boundary; the 20 instructions before .Ldec_loop then put the
! loop head at offset 96, a 32-byte boundary. Lines ending in a bare !
! appear to mark every eighth instruction, i.e. the start of a 32-byte
! block. That count includes the FP-register self-moves, which the
! generator script deletes before printing.
! ---------------------------------------------------------------------
725 .align  64
726 .skip   16
727 _sparcv9_AES_decrypt:
! New register window. The frame is enlarged by the locals area, whose
! first slot keeps %i7 until just before the return.
! NOTE(review): presumably %i7 doubles as a working register in
! between - confirm against the register assignments.
728         save    %sp,-$frame-$locals,%sp
729         stx     %i7,[%sp+$bias+$frame+0]        ! off-load return address
! Prologue: read the round count, XOR the first four key words into
! s0-s3, preload t0-t3 with the next four key words, and start forming
! the first two table indices (acc0, acc1) for the loop.
730         ld      [$key+240],$rounds
731         ld      [$key+0],$t0
732         ld      [$key+4],$t1                    !
733         ld      [$key+8],$t2
734         ld      [$key+12],$t3
735         srl     $rounds,1,$rounds
736         xor     $t0,$s0,$s0
737         ld      [$key+16],$t0
738         xor     $t1,$s1,$s1
739         ld      [$key+20],$t1
740         srl     $s0,21,$acc0                    !
741         xor     $t2,$s2,$s2
742         ld      [$key+24],$t2
743         xor     $t3,$s3,$s3
744         and     $acc0,2040,$acc0
745         ld      [$key+28],$t3
746         srl     $s3,13,$acc1
747         nop
! First half of the pass: 16 lookups indexed by the bytes of s0-s3.
! acc0-acc3 feed t0, acc4-acc7 feed t1, acc8-acc11 feed t2 and
! acc12-acc15 feed t3. Shifts by 21, 13, 5 (right) and 3 (left) followed
! by the 2040 mask turn bytes 3, 2, 1 and 0 of a word into entry offsets.
748 .Ldec_loop:
749         srl     $s2,5,$acc2                     !
750         and     $acc1,2040,$acc1
751         ldx     [$tbl+$acc0],$acc0
752         sll     $s1,3,$acc3
753         and     $acc2,2040,$acc2
754         ldx     [$tbl+$acc1],$acc1
755         srl     $s1,21,$acc4
756         and     $acc3,2040,$acc3
757         ldx     [$tbl+$acc2],$acc2              !
758         srl     $s0,13,$acc5
759         and     $acc4,2040,$acc4
760         ldx     [$tbl+$acc3],$acc3
761         srl     $s3,5,$acc6
762         and     $acc5,2040,$acc5
763         ldx     [$tbl+$acc4],$acc4
764         fmovs   %f0,%f0
765         sll     $s2,3,$acc7                     !
766         and     $acc6,2040,$acc6
767         ldx     [$tbl+$acc5],$acc5
768         srl     $s2,21,$acc8
769         and     $acc7,2040,$acc7
770         ldx     [$tbl+$acc6],$acc6
771         srl     $s1,13,$acc9
772         and     $acc8,2040,$acc8
773         ldx     [$tbl+$acc7],$acc7              !
774         srl     $s0,5,$acc10
775         and     $acc9,2040,$acc9
776         ldx     [$tbl+$acc8],$acc8
777         sll     $s3,3,$acc11
778         and     $acc10,2040,$acc10
779         ldx     [$tbl+$acc9],$acc9
780         fmovs   %f0,%f0
781         srl     $s3,21,$acc12                   !
782         and     $acc11,2040,$acc11
783         ldx     [$tbl+$acc10],$acc10
784         srl     $s2,13,$acc13
785         and     $acc12,2040,$acc12
786         ldx     [$tbl+$acc11],$acc11
787         srl     $s1,5,$acc14
788         and     $acc13,2040,$acc13
789         ldx     [$tbl+$acc12],$acc12            !
790         sll     $s0,3,$acc15
791         and     $acc14,2040,$acc14
792         ldx     [$tbl+$acc13],$acc13
793         and     $acc15,2040,$acc15
! Step key to the next pair of round keys (two rounds per pass).
794         add     $key,32,$key
795         ldx     [$tbl+$acc14],$acc14
796         fmovs   %f0,%f0
! Count down. The branch is annulling (,a): the delay-slot add runs only
! when the branch is taken, i.e. on the last pass, where it turns rounds
! into a pointer to the byte table at tbl+2048.
797         subcc   $rounds,1,$rounds               !
798         ldx     [$tbl+$acc15],$acc15
799         bz,a,pn %icc,.Ldec_last
800         add     $tbl,2048,$rounds
801
! Not the last pass. The deeper-indented instructions shift and XOR the
! 16 looked-up values into t0-t3 (which already hold a round key).
! Interleaved with them, s0-s3 are reloaded with the next round key and
! the first indices of the second half are formed from t0 and t3.
802                 srlx    $acc1,8,$acc1
803                 xor     $acc0,$t0,$t0
804         ld      [$key+0],$s0
805         fmovs   %f0,%f0
806                 srlx    $acc2,16,$acc2          !
807                 xor     $acc1,$t0,$t0
808         ld      [$key+4],$s1
809                 srlx    $acc3,24,$acc3
810                 xor     $acc2,$t0,$t0
811         ld      [$key+8],$s2
812                 srlx    $acc5,8,$acc5
813                 xor     $acc3,$t0,$t0
814         ld      [$key+12],$s3                   !
815                 srlx    $acc6,16,$acc6
816                 xor     $acc4,$t1,$t1
817         fmovs   %f0,%f0
818                 srlx    $acc7,24,$acc7
819                 xor     $acc5,$t1,$t1
820                 srlx    $acc9,8,$acc9
821                 xor     $acc6,$t1,$t1
822                 srlx    $acc10,16,$acc10        !
823                 xor     $acc7,$t1,$t1
824                 srlx    $acc11,24,$acc11
825                 xor     $acc8,$t2,$t2
826                 srlx    $acc13,8,$acc13
827                 xor     $acc9,$t2,$t2
828                 srlx    $acc14,16,$acc14
829                 xor     $acc10,$t2,$t2
830                 srlx    $acc15,24,$acc15        !
831                 xor     $acc11,$t2,$t2
832                 xor     $acc12,$acc14,$acc14
833                 xor     $acc13,$t3,$t3
834         srl     $t0,21,$acc0
835                 xor     $acc14,$t3,$t3
836                 xor     $acc15,$t3,$t3
837         srl     $t3,13,$acc1
838
! Second half of the pass: the same 16 lookups, now indexed by the bytes
! of t0-t3 and accumulated into s0-s3.
839         and     $acc0,2040,$acc0                !
840         srl     $t2,5,$acc2
841         and     $acc1,2040,$acc1
842         ldx     [$tbl+$acc0],$acc0
843         sll     $t1,3,$acc3
844         and     $acc2,2040,$acc2
845         ldx     [$tbl+$acc1],$acc1
846         fmovs   %f0,%f0
847         srl     $t1,21,$acc4                    !
848         and     $acc3,2040,$acc3
849         ldx     [$tbl+$acc2],$acc2
850         srl     $t0,13,$acc5
851         and     $acc4,2040,$acc4
852         ldx     [$tbl+$acc3],$acc3
853         srl     $t3,5,$acc6
854         and     $acc5,2040,$acc5
855         ldx     [$tbl+$acc4],$acc4              !
856         sll     $t2,3,$acc7
857         and     $acc6,2040,$acc6
858         ldx     [$tbl+$acc5],$acc5
859         srl     $t2,21,$acc8
860         and     $acc7,2040,$acc7
861         ldx     [$tbl+$acc6],$acc6
862         fmovs   %f0,%f0
863         srl     $t1,13,$acc9                    !
864         and     $acc8,2040,$acc8
865         ldx     [$tbl+$acc7],$acc7
866         srl     $t0,5,$acc10
867         and     $acc9,2040,$acc9
868         ldx     [$tbl+$acc8],$acc8
869         sll     $t3,3,$acc11
870         and     $acc10,2040,$acc10
871         ldx     [$tbl+$acc9],$acc9              !
872         srl     $t3,21,$acc12
873         and     $acc11,2040,$acc11
874         ldx     [$tbl+$acc10],$acc10
875         srl     $t2,13,$acc13
876         and     $acc12,2040,$acc12
877         ldx     [$tbl+$acc11],$acc11
878         fmovs   %f0,%f0
879         srl     $t1,5,$acc14                    !
880         and     $acc13,2040,$acc13
881         ldx     [$tbl+$acc12],$acc12
882         sll     $t0,3,$acc15
883         and     $acc14,2040,$acc14
884         ldx     [$tbl+$acc13],$acc13
885                 srlx    $acc1,8,$acc1
886         and     $acc15,2040,$acc15
887         ldx     [$tbl+$acc14],$acc14            !
888
! Combine into s0-s3 while reloading t0-t3 with the following round key
! and forming the first two indices of the next pass.
889                 srlx    $acc2,16,$acc2
890                 xor     $acc0,$s0,$s0
891         ldx     [$tbl+$acc15],$acc15
892                 srlx    $acc3,24,$acc3
893                 xor     $acc1,$s0,$s0
894         ld      [$key+16],$t0
895         fmovs   %f0,%f0
896                 srlx    $acc5,8,$acc5           !
897                 xor     $acc2,$s0,$s0
898         ld      [$key+20],$t1
899                 srlx    $acc6,16,$acc6
900                 xor     $acc3,$s0,$s0
901         ld      [$key+24],$t2
902                 srlx    $acc7,24,$acc7
903                 xor     $acc4,$s1,$s1
904         ld      [$key+28],$t3                   !
905                 srlx    $acc9,8,$acc9
906                 xor     $acc5,$s1,$s1
907                 srlx    $acc10,16,$acc10
908                 xor     $acc6,$s1,$s1
909                 srlx    $acc11,24,$acc11
910                 xor     $acc7,$s1,$s1
911                 srlx    $acc13,8,$acc13
912                 xor     $acc8,$s2,$s2
913                 srlx    $acc14,16,$acc14        !
914                 xor     $acc9,$s2,$s2
915                 srlx    $acc15,24,$acc15
916                 xor     $acc10,$s2,$s2
917         srl     $s0,21,$acc0
918                 xor     $acc11,$s2,$s2
919                 xor     $acc12,$acc14,$acc14
920                 xor     $acc13,$s3,$s3
921         and     $acc0,2040,$acc0                !
922                 xor     $acc14,$s3,$s3
923                 xor     $acc15,$s3,$s3
! Back to the loop head; the delay-slot srl always executes (plain ba).
924         ba      .Ldec_loop
925         srl     $s3,13,$acc1
926
927 .align  32
! Last pass: finish the pending round into t0-t3 exactly as in the loop
! and load the final round key into s0-s3. The first two indices of the
! final round are plain byte shifts (24, 16), not scaled by 8.
928 .Ldec_last:
929                 srlx    $acc1,8,$acc1           !
930                 xor     $acc0,$t0,$t0
931         ld      [$key+0],$s0
932                 srlx    $acc2,16,$acc2
933                 xor     $acc1,$t0,$t0
934         ld      [$key+4],$s1
935                 srlx    $acc3,24,$acc3
936                 xor     $acc2,$t0,$t0
937         ld      [$key+8],$s2                    !
938                 srlx    $acc5,8,$acc5
939                 xor     $acc3,$t0,$t0
940         ld      [$key+12],$s3
941                 srlx    $acc6,16,$acc6
942                 xor     $acc4,$t1,$t1
943                 srlx    $acc7,24,$acc7
944                 xor     $acc5,$t1,$t1
945                 srlx    $acc9,8,$acc9           !
946                 xor     $acc6,$t1,$t1
947                 srlx    $acc10,16,$acc10
948                 xor     $acc7,$t1,$t1
949                 srlx    $acc11,24,$acc11
950                 xor     $acc8,$t2,$t2
951                 srlx    $acc13,8,$acc13
952                 xor     $acc9,$t2,$t2
953                 srlx    $acc14,16,$acc14        !
954                 xor     $acc10,$t2,$t2
955                 srlx    $acc15,24,$acc15
956                 xor     $acc11,$t2,$t2
957                 xor     $acc12,$acc14,$acc14
958                 xor     $acc13,$t3,$t3
959         srl     $t0,24,$acc0
960                 xor     $acc14,$t3,$t3
961                 xor     $acc15,$t3,$t3          !
962         srl     $t3,16,$acc1
963
! Final round: 16 byte loads from the table that rounds now points to.
! Indices are single bytes of t0-t3 (shift, then mask with 255), chosen
! with the same word and byte pattern as in the loop.
964         srl     $t2,8,$acc2
965         and     $acc1,255,$acc1
966         ldub    [$rounds+$acc0],$acc0
967         srl     $t1,24,$acc4
968         and     $acc2,255,$acc2
969         ldub    [$rounds+$acc1],$acc1
970         srl     $t0,16,$acc5                    !
971         and     $t1,255,$acc3
972         ldub    [$rounds+$acc2],$acc2
973         ldub    [$rounds+$acc3],$acc3
974         srl     $t3,8,$acc6
975         and     $acc5,255,$acc5
976         ldub    [$rounds+$acc4],$acc4
977         fmovs   %f0,%f0
978         srl     $t2,24,$acc8                    !
979         and     $acc6,255,$acc6
980         ldub    [$rounds+$acc5],$acc5
981         srl     $t1,16,$acc9
982         and     $t2,255,$acc7
983         ldub    [$rounds+$acc6],$acc6
984         ldub    [$rounds+$acc7],$acc7
985         fmovs   %f0,%f0
986         srl     $t0,8,$acc10                    !
987         and     $acc9,255,$acc9
988         ldub    [$rounds+$acc8],$acc8
989         srl     $t3,24,$acc12
990         and     $acc10,255,$acc10
991         ldub    [$rounds+$acc9],$acc9
992         srl     $t2,16,$acc13
993         and     $t3,255,$acc11
994         ldub    [$rounds+$acc10],$acc10         !
995         srl     $t1,8,$acc14
996         and     $acc13,255,$acc13
997         ldub    [$rounds+$acc11],$acc11
998         ldub    [$rounds+$acc12],$acc12
999         and     $acc14,255,$acc14
1000         ldub    [$rounds+$acc13],$acc13
1001         and     $t0,255,$acc15
1002         ldub    [$rounds+$acc14],$acc14         !
1003
! Move every looked-up byte to its position (shift left by 24, 16, 8 or
! 0) and XOR it into the final round key held in s0-s3. The saved %i7 is
! reloaded along the way.
1004                 sll     $acc0,24,$acc0
1005                 xor     $acc3,$s0,$s0
1006         ldub    [$rounds+$acc15],$acc15
1007                 sll     $acc1,16,$acc1
1008                 xor     $acc0,$s0,$s0
1009         ldx     [%sp+$bias+$frame+0],%i7        ! restore return address
1010         fmovs   %f0,%f0
1011                 sll     $acc2,8,$acc2           !
1012                 xor     $acc1,$s0,$s0
1013                 sll     $acc4,24,$acc4
1014                 xor     $acc2,$s0,$s0
1015                 sll     $acc5,16,$acc5
1016                 xor     $acc7,$s1,$s1
1017                 sll     $acc6,8,$acc6
1018                 xor     $acc4,$s1,$s1
1019                 sll     $acc8,24,$acc8          !
1020                 xor     $acc5,$s1,$s1
1021                 sll     $acc9,16,$acc9
1022                 xor     $acc11,$s2,$s2
1023                 sll     $acc10,8,$acc10
1024                 xor     $acc6,$s1,$s1
1025                 sll     $acc12,24,$acc12
1026                 xor     $acc8,$s2,$s2
1027                 sll     $acc13,16,$acc13        !
1028                 xor     $acc9,$s2,$s2
1029                 sll     $acc14,8,$acc14
1030                 xor     $acc10,$s2,$s2
1031                 xor     $acc12,$acc14,$acc14
1032                 xor     $acc13,$s3,$s3
1033                 xor     $acc14,$s3,$s3
1034                 xor     $acc15,$s3,$s3
1035
! Return to the caller with the result in s0-s3; restore sits in the
! delay slot of ret and pops the register window.
1036         ret
1037         restore
1038 .type   _sparcv9_AES_decrypt,#function
1039 .size   _sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt)
1040
! ---------------------------------------------------------------------
! AES_decrypt - exported entry point; decrypts one 16-byte block.
!
!   %o0  input pointer (16 bytes are read)
!   %o1  output pointer (16 bytes are written)
!   %o2  key pointer, handed to _sparcv9_AES_decrypt in %o5
!
! If both pointers are 4-byte aligned the block is moved with word
! loads and stores. Otherwise .Lunaligned_dec moves it byte by byte,
! first byte into bits 31-24 of each word, which is the same value a
! big-endian word load would give.
!
! The core routine receives the block in %o0-%o3, the address of AES_Td
! in %o4 and the key pointer in %o5, and hands the result back in
! %o0-%o3.
! ---------------------------------------------------------------------
1041 .align  32
1042 .globl  AES_decrypt
1043 AES_decrypt:
! Take the byte-wise path if either pointer has one of its two low bits
! set. The save sits in the delay slot of a non-annulling branch, so it
! executes on both paths and the arguments become %i0-%i2.
1044         or      %o0,%o1,%g1
1045         andcc   %g1,3,%g0
1046         bnz,pn  %xcc,.Lunaligned_dec
1047         save    %sp,-$frame,%sp
1048
! Aligned path: fetch the block as four words.
1049         ld      [%i0+0],%o0
1050         ld      [%i0+4],%o1
1051         ld      [%i0+8],%o2
1052         ld      [%i0+12],%o3
1053
! Position-independent address of AES_Td: call .+8 only serves to put
! the address of label 1 into %o7. Its delay slot copies the key pointer
! to %o5, and the sub then subtracts the distance from AES_Td to label 1,
! leaving the table address in %o4. The nop fills the delay slot of the
! real call.
1054 1:      call    .+8
1055         mov     %i2,%o5
1056         sub     %o7,1b-AES_Td,%o4
1057         call    _sparcv9_AES_decrypt
1058         nop
1059
! Store the four result words.
1060         st      %o0,[%i1+0]
1061         st      %o1,[%i1+4]
1062         st      %o2,[%i1+8]
1063         st      %o3,[%i1+12]
1064
1065         ret
1066         restore
1067
1068 .align  32
! Unaligned path: build each 32-bit word from four byte loads. Loads
! are interleaved with the shifts and ORs of the bytes fetched before
! them; %l0-%l7 are scratch and get reused for the second pair of words.
1069 .Lunaligned_dec:
1070         ldub    [%i0+0],%l0
1071         ldub    [%i0+1],%l1
1072         ldub    [%i0+2],%l2
1073
1074         sll     %l0,24,%l0
1075         ldub    [%i0+3],%l3
1076         sll     %l1,16,%l1
1077         ldub    [%i0+4],%l4
1078         sll     %l2,8,%l2
1079         or      %l1,%l0,%l0
1080         ldub    [%i0+5],%l5
1081         sll     %l4,24,%l4
1082         or      %l3,%l2,%l2
1083         ldub    [%i0+6],%l6
1084         sll     %l5,16,%l5
1085         or      %l0,%l2,%o0
1086         ldub    [%i0+7],%l7
1087
1088         sll     %l6,8,%l6
1089         or      %l5,%l4,%l4
1090         ldub    [%i0+8],%l0
1091         or      %l7,%l6,%l6
1092         ldub    [%i0+9],%l1
1093         or      %l4,%l6,%o1
1094         ldub    [%i0+10],%l2
1095
1096         sll     %l0,24,%l0
1097         ldub    [%i0+11],%l3
1098         sll     %l1,16,%l1
1099         ldub    [%i0+12],%l4
1100         sll     %l2,8,%l2
1101         or      %l1,%l0,%l0
1102         ldub    [%i0+13],%l5
1103         sll     %l4,24,%l4
1104         or      %l3,%l2,%l2
1105         ldub    [%i0+14],%l6
1106         sll     %l5,16,%l5
1107         or      %l0,%l2,%o2
1108         ldub    [%i0+15],%l7
1109
1110         sll     %l6,8,%l6
1111         or      %l5,%l4,%l4
1112         or      %l7,%l6,%l6
1113         or      %l4,%l6,%o3
1114
! Same table-address sequence and call as on the aligned path.
1115 1:      call    .+8
1116         mov     %i2,%o5
1117         sub     %o7,1b-AES_Td,%o4
1118         call    _sparcv9_AES_decrypt
1119         nop
1120
! Write the result back one byte at a time, most significant byte of
! each word first; stb stores the low 8 bits of its source register, so
! the last byte of a word is stored straight from %o0-%o3.
1121         srl     %o0,24,%l0
1122         srl     %o0,16,%l1
1123         stb     %l0,[%i1+0]
1124         srl     %o0,8,%l2
1125         stb     %l1,[%i1+1]
1126         stb     %l2,[%i1+2]
1127         srl     %o1,24,%l4
1128         stb     %o0,[%i1+3]
1129
1130         srl     %o1,16,%l5
1131         stb     %l4,[%i1+4]
1132         srl     %o1,8,%l6
1133         stb     %l5,[%i1+5]
1134         stb     %l6,[%i1+6]
1135         srl     %o2,24,%l0
1136         stb     %o1,[%i1+7]
1137
1138         srl     %o2,16,%l1
1139         stb     %l0,[%i1+8]
1140         srl     %o2,8,%l2
1141         stb     %l1,[%i1+9]
1142         stb     %l2,[%i1+10]
1143         srl     %o3,24,%l4
1144         stb     %o2,[%i1+11]
1145
1146         srl     %o3,16,%l5
1147         stb     %l4,[%i1+12]
1148         srl     %o3,8,%l6
1149         stb     %l5,[%i1+13]
1150         stb     %l6,[%i1+14]
1151         stb     %o3,[%i1+15]
1152
1153         ret
1154         restore
1155 .type   AES_decrypt,#function
1156 .size   AES_decrypt,(.-AES_decrypt)
1157 ___
1158
1159 # fmovs instructions substituting for FP nops were originally added
1160 # to meet specific instruction alignment requirements to maximize ILP.
1161 # As UltraSPARC T1, a.k.a. Niagara, has shared FPU, FP nops can have
1162 # undesired effect, so just omit them and sacrifice some portion of
1163 # percent in performance...
#
# The substitution blanks everything from "fmovs" to the end of its
# line (/m makes $ match at every line end, /g repeats it for the whole
# text); the leading whitespace and the newline of such a line stay in
# place. The replacement is an empty literal, so no /e flag is needed.
1164 $code =~ s/fmovs.*$//gm;
1165
1166 print $code;
# STDOUT is buffered, so a failed write (full disk, closed pipe) may only
# be reported when the handle is closed. Check it, so that a truncated
# assembly file makes the build stop instead of passing silently.
close STDOUT or die "error closing STDOUT: $!";