Remove email addresses from source code.
[openssl.git] / crypto / aes / asm / aes-sparcv9.pl
1 #! /usr/bin/env perl
2 # Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. Rights for redistribution and usage in source and binary
13 # forms are granted according to the OpenSSL license.
14 # ====================================================================
15 #
16 # Version 1.1
17 #
18 # The major reason for undertaking this effort was to mitigate the hazard of
19 # cache-timing attacks. This is [currently and initially!] addressed in
20 # two ways. 1. S-boxes are compressed from 5KB to 2KB+256B size each.
21 # 2. References to them are scheduled for L2 cache latency, meaning
22 # that the tables don't have to reside in L1 cache. Once again, this
23 # is an initial draft and one should expect more countermeasures to
24 # be implemented...
25 #
26 # Version 1.1 prefetches T[ed]4 in order to mitigate attack on last
27 # round.
28 #
29 # Even though performance was not the primary goal [on the contrary,
30 # extra shifts "induced" by compressed S-box and longer loop epilogue
31 # "induced" by scheduling for L2 have negative effect on performance],
32 # the code turned out to run in ~23 cycles per processed byte en-/
33 # decrypted with a 128-bit key. This is a pretty good result for code
34 # with the mentioned qualities on an UltraSPARC core. Compared to Sun C
35 # generated code my encrypt procedure runs just a few percent faster,
36 # while the decrypt one is a whole 50% faster [yes, Sun C failed to generate
37 # an optimal decrypt procedure]. Compared to GNU C generated code both
38 # procedures are more than 60% faster:-)
39
# Output handling: the last command-line argument names the file that is to
# receive the generated assembly, and STDOUT is redirected to it so that
# anything subsequently printed to STDOUT ends up in that file.
#
# The three-argument form of open keeps characters in the file name from
# being interpreted as part of the open mode, and the result is checked so
# that a file which cannot be created stops the script instead of letting it
# carry on with STDOUT not redirected.  When no argument is supplied STDOUT
# is left untouched.
$output = pop;
if (defined $output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
42
# Stack-frame parameters used by the internal routines.  $frame and $bias are
# interpolated into the assembly as the symbols STACK_FRAME and STACK_BIAS
# (presumably macros supplied by sparc_arch.h - confirm there); $locals is
# the extra space, in bytes, that "save" reserves on top of $frame.
($frame, $bias, $locals) = ("STACK_FRAME", "STACK_BIAS", 16);

# Sixteen registers that receive the table look-ups, four per row.
($acc0,  $acc1,  $acc2,  $acc3)  = qw(%l0 %o0 %o1 %o2);
($acc4,  $acc5,  $acc6,  $acc7)  = qw(%l1 %o3 %o4 %o5);
($acc8,  $acc9,  $acc10, $acc11) = qw(%l2 %o7 %g1 %g2);
($acc12, $acc13, $acc14, $acc15) = qw(%l3 %g3 %g4 %g5);

# Four temporaries; they hold round-key words and intermediate state.
($t0, $t1, $t2, $t3) = qw(%l4 %l5 %l6 %l7);

# Four state words, followed by the table and key-schedule pointers.
($s0, $s1, $s2, $s3) = qw(%i0 %i1 %i2 %i3);
($tbl, $key)         = qw(%i4 %i5);

# aliases with return address, which is off-loaded to stack
$rounds = "%i7";
79
# _data_word(LIST)
#
# Appends one ".long" directive to the global $code for every value in LIST.
# Each value is written twice on its line, formatted as 0x%08x, so a single
# table entry occupies eight bytes in the generated assembly.
#
# Emission stops at the first undefined value in LIST.  Nothing meaningful is
# returned; the sub is used only for its effect on $code.
#
# The sub used to carry an empty prototype "()" even though it is always
# given arguments; that only worked because every call site uses the
# prototype-bypassing "&" form.  Without the prototype both "&_data_word(...)"
# and "_data_word(...)" are valid.
sub _data_word
{
    foreach my $word (@_) {
        last unless defined $word;
        $code .= sprintf "\t.long\t0x%08x,0x%08x\n", $word, $word;
    }
}
84
# Start of the generated assembly: include sparc_arch.h, declare %g2 and %g3
# as scratch registers when __arch64__ is defined, switch to the ".text"
# section and open the 256-byte-aligned AES_Te table.
85 $code.=<<___;
86 #include "sparc_arch.h"
87
88 #ifdef  __arch64__
89 .register       %g2,#scratch
90 .register       %g3,#scratch
91 #endif
92 .section        ".text",#alloc,#execinstr
93
94 .align  256
95 AES_Te:
96 ___
# The 256 words of the encryption table.  _data_word writes every word twice,
# so each entry takes 8 bytes and the whole list takes 2KB (this is the layout
# the round code relies on when it masks offsets with 2040 and loads with ldx).
97 &_data_word(
98         0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
99         0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
100         0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
101         0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
102         0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
103         0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
104         0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
105         0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
106         0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
107         0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
108         0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
109         0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
110         0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
111         0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
112         0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
113         0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
114         0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
115         0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
116         0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
117         0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
118         0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
119         0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
120         0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
121         0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
122         0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
123         0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
124         0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
125         0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
126         0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
127         0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
128         0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
129         0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
130         0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
131         0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
132         0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
133         0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
134         0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
135         0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
136         0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
137         0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
138         0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
139         0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
140         0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
141         0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
142         0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
143         0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
144         0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
145         0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
146         0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
147         0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
148         0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
149         0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
150         0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
151         0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
152         0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
153         0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
154         0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
155         0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
156         0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
157         0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
158         0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
159         0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
160         0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
161         0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
# Remainder of AES_Te followed by the encryption code.
#
# The .byte rows form a 256-byte table placed directly after the 2KB of
# words; the code addresses it as $tbl+2048 and its prefetch comments call it
# te4.  The .size directive closes AES_Te after it.
#
# _sparcv9_AES_encrypt (internal routine)
#   in:   $s0-$s3 (%i0-%i3) hold the four 32-bit state words, $tbl (%i4)
#         points at AES_Te and $key (%i5) at the key schedule; the round
#         count is read from [$key+240].
#   out:  $s0-$s3, which the caller sees as %o0-%o3 after "restore".
#   %i7 doubles as $rounds, so the return address is stored on the stack on
#   entry and reloaded before "ret".
#   One full pass through .Lenc_loop covers two rounds (hence the initial
#   "srl $rounds,1"): a state byte is shifted into bits 3..10, masked with
#   2040 (255*8) to form a table offset, the 8-byte entry is fetched with ldx
#   and srlx by 8/16/24 selects the rotation that is XORed into the result.
#   The second half of the pass also touches $tbl+2048 in 32-byte steps with
#   loads into %g0 (the "prefetch te4" lines).
#   When the counter reaches zero the branch to .Lenc_last is taken and its
#   delay slot repoints $rounds at $tbl+2048; .Lenc_last finishes the pending
#   round and performs the last one with byte loads (ldub) from that table.
#   NOTE(review): the "fmovs %f0,%f0" instructions do not alter any data used
#   here; presumably they are instruction-scheduling filler - confirm.
#
# AES_encrypt (global entry point)
#   %o0 = input block, %o1 = output block, %o2 = key schedule.
#   If either block pointer has one of its low two bits set the code branches
#   to .Lunaligned_enc, which assembles the four words from single bytes
#   (most significant byte first) and stores the result byte by byte;
#   otherwise whole words are loaded and stored with ld/st.  Both paths obtain
#   the address of AES_Te relative to the program counter ("call .+8" followed
#   by "add %o7,AES_Te-1b,%o4") and pass the key pointer in %o5.
162 $code.=<<___;
163         .byte   0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
164         .byte   0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
165         .byte   0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
166         .byte   0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
167         .byte   0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
168         .byte   0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
169         .byte   0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
170         .byte   0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
171         .byte   0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
172         .byte   0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
173         .byte   0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
174         .byte   0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
175         .byte   0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
176         .byte   0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
177         .byte   0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
178         .byte   0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
179         .byte   0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
180         .byte   0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
181         .byte   0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
182         .byte   0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
183         .byte   0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
184         .byte   0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
185         .byte   0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
186         .byte   0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
187         .byte   0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
188         .byte   0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
189         .byte   0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
190         .byte   0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
191         .byte   0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
192         .byte   0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
193         .byte   0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
194         .byte   0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
195 .type   AES_Te,#object
196 .size   AES_Te,(.-AES_Te)
197
198 .align  64
199 .skip   16
200 _sparcv9_AES_encrypt:
201         save    %sp,-$frame-$locals,%sp
202         stx     %i7,[%sp+$bias+$frame+0]        ! off-load return address
203         ld      [$key+240],$rounds
204         ld      [$key+0],$t0
205         ld      [$key+4],$t1                    !
206         ld      [$key+8],$t2
207         srl     $rounds,1,$rounds
208         xor     $t0,$s0,$s0
209         ld      [$key+12],$t3
210         srl     $s0,21,$acc0
211         xor     $t1,$s1,$s1
212         ld      [$key+16],$t0
213         srl     $s1,13,$acc1                    !
214         xor     $t2,$s2,$s2
215         ld      [$key+20],$t1
216         xor     $t3,$s3,$s3
217         ld      [$key+24],$t2
218         and     $acc0,2040,$acc0
219         ld      [$key+28],$t3
220         nop
221 .Lenc_loop:
222         srl     $s2,5,$acc2                     !
223         and     $acc1,2040,$acc1
224         ldx     [$tbl+$acc0],$acc0
225         sll     $s3,3,$acc3
226         and     $acc2,2040,$acc2
227         ldx     [$tbl+$acc1],$acc1
228         srl     $s1,21,$acc4
229         and     $acc3,2040,$acc3
230         ldx     [$tbl+$acc2],$acc2              !
231         srl     $s2,13,$acc5
232         and     $acc4,2040,$acc4
233         ldx     [$tbl+$acc3],$acc3
234         srl     $s3,5,$acc6
235         and     $acc5,2040,$acc5
236         ldx     [$tbl+$acc4],$acc4
237         fmovs   %f0,%f0
238         sll     $s0,3,$acc7                     !
239         and     $acc6,2040,$acc6
240         ldx     [$tbl+$acc5],$acc5
241         srl     $s2,21,$acc8
242         and     $acc7,2040,$acc7
243         ldx     [$tbl+$acc6],$acc6
244         srl     $s3,13,$acc9
245         and     $acc8,2040,$acc8
246         ldx     [$tbl+$acc7],$acc7              !
247         srl     $s0,5,$acc10
248         and     $acc9,2040,$acc9
249         ldx     [$tbl+$acc8],$acc8
250         sll     $s1,3,$acc11
251         and     $acc10,2040,$acc10
252         ldx     [$tbl+$acc9],$acc9
253         fmovs   %f0,%f0
254         srl     $s3,21,$acc12                   !
255         and     $acc11,2040,$acc11
256         ldx     [$tbl+$acc10],$acc10
257         srl     $s0,13,$acc13
258         and     $acc12,2040,$acc12
259         ldx     [$tbl+$acc11],$acc11
260         srl     $s1,5,$acc14
261         and     $acc13,2040,$acc13
262         ldx     [$tbl+$acc12],$acc12            !
263         sll     $s2,3,$acc15
264         and     $acc14,2040,$acc14
265         ldx     [$tbl+$acc13],$acc13
266         and     $acc15,2040,$acc15
267         add     $key,32,$key
268         ldx     [$tbl+$acc14],$acc14
269         fmovs   %f0,%f0
270         subcc   $rounds,1,$rounds               !
271         ldx     [$tbl+$acc15],$acc15
272         bz,a,pn %icc,.Lenc_last
273         add     $tbl,2048,$rounds
274
275                 srlx    $acc1,8,$acc1
276                 xor     $acc0,$t0,$t0
277         ld      [$key+0],$s0
278         fmovs   %f0,%f0
279                 srlx    $acc2,16,$acc2          !
280                 xor     $acc1,$t0,$t0
281         ld      [$key+4],$s1
282                 srlx    $acc3,24,$acc3
283                 xor     $acc2,$t0,$t0
284         ld      [$key+8],$s2
285                 srlx    $acc5,8,$acc5
286                 xor     $acc3,$t0,$t0
287         ld      [$key+12],$s3                   !
288                 srlx    $acc6,16,$acc6
289                 xor     $acc4,$t1,$t1
290         fmovs   %f0,%f0
291                 srlx    $acc7,24,$acc7
292                 xor     $acc5,$t1,$t1
293                 srlx    $acc9,8,$acc9
294                 xor     $acc6,$t1,$t1
295                 srlx    $acc10,16,$acc10        !
296                 xor     $acc7,$t1,$t1
297                 srlx    $acc11,24,$acc11
298                 xor     $acc8,$t2,$t2
299                 srlx    $acc13,8,$acc13
300                 xor     $acc9,$t2,$t2
301                 srlx    $acc14,16,$acc14
302                 xor     $acc10,$t2,$t2
303                 srlx    $acc15,24,$acc15        !
304                 xor     $acc11,$t2,$t2
305                 xor     $acc12,$acc14,$acc14
306                 xor     $acc13,$t3,$t3
307         srl     $t0,21,$acc0
308                 xor     $acc14,$t3,$t3
309         srl     $t1,13,$acc1
310                 xor     $acc15,$t3,$t3
311
312         and     $acc0,2040,$acc0                !
313         srl     $t2,5,$acc2
314         and     $acc1,2040,$acc1
315         ldx     [$tbl+$acc0],$acc0
316         sll     $t3,3,$acc3
317         and     $acc2,2040,$acc2
318         ldx     [$tbl+$acc1],$acc1
319         fmovs   %f0,%f0
320         srl     $t1,21,$acc4                    !
321         and     $acc3,2040,$acc3
322         ldx     [$tbl+$acc2],$acc2
323         srl     $t2,13,$acc5
324         and     $acc4,2040,$acc4
325         ldx     [$tbl+$acc3],$acc3
326         srl     $t3,5,$acc6
327         and     $acc5,2040,$acc5
328         ldx     [$tbl+$acc4],$acc4              !
329         sll     $t0,3,$acc7
330         and     $acc6,2040,$acc6
331         ldx     [$tbl+$acc5],$acc5
332         srl     $t2,21,$acc8
333         and     $acc7,2040,$acc7
334         ldx     [$tbl+$acc6],$acc6
335         fmovs   %f0,%f0
336         srl     $t3,13,$acc9                    !
337         and     $acc8,2040,$acc8
338         ldx     [$tbl+$acc7],$acc7
339         srl     $t0,5,$acc10
340         and     $acc9,2040,$acc9
341         ldx     [$tbl+$acc8],$acc8
342         sll     $t1,3,$acc11
343         and     $acc10,2040,$acc10
344         ldx     [$tbl+$acc9],$acc9              !
345         srl     $t3,21,$acc12
346         and     $acc11,2040,$acc11
347         ldx     [$tbl+$acc10],$acc10
348         srl     $t0,13,$acc13
349         and     $acc12,2040,$acc12
350         ldx     [$tbl+$acc11],$acc11
351         fmovs   %f0,%f0
352         srl     $t1,5,$acc14                    !
353         and     $acc13,2040,$acc13
354         ldx     [$tbl+$acc12],$acc12
355         sll     $t2,3,$acc15
356         and     $acc14,2040,$acc14
357         ldx     [$tbl+$acc13],$acc13
358                 srlx    $acc1,8,$acc1
359         and     $acc15,2040,$acc15
360         ldx     [$tbl+$acc14],$acc14            !
361
362                 srlx    $acc2,16,$acc2
363                 xor     $acc0,$s0,$s0
364         ldx     [$tbl+$acc15],$acc15
365                 srlx    $acc3,24,$acc3
366                 xor     $acc1,$s0,$s0
367         ld      [$key+16],$t0
368         fmovs   %f0,%f0
369                 srlx    $acc5,8,$acc5           !
370                 xor     $acc2,$s0,$s0
371         ld      [$key+20],$t1
372                 srlx    $acc6,16,$acc6
373                 xor     $acc3,$s0,$s0
374         ld      [$key+24],$t2
375                 srlx    $acc7,24,$acc7
376                 xor     $acc4,$s1,$s1
377         ld      [$key+28],$t3                   !
378                 srlx    $acc9,8,$acc9
379                 xor     $acc5,$s1,$s1
380         ldx     [$tbl+2048+0],%g0               ! prefetch te4
381                 srlx    $acc10,16,$acc10
382                 xor     $acc6,$s1,$s1
383         ldx     [$tbl+2048+32],%g0              ! prefetch te4
384                 srlx    $acc11,24,$acc11
385                 xor     $acc7,$s1,$s1
386         ldx     [$tbl+2048+64],%g0              ! prefetch te4
387                 srlx    $acc13,8,$acc13
388                 xor     $acc8,$s2,$s2
389         ldx     [$tbl+2048+96],%g0              ! prefetch te4
390                 srlx    $acc14,16,$acc14        !
391                 xor     $acc9,$s2,$s2
392         ldx     [$tbl+2048+128],%g0             ! prefetch te4
393                 srlx    $acc15,24,$acc15
394                 xor     $acc10,$s2,$s2
395         ldx     [$tbl+2048+160],%g0             ! prefetch te4
396         srl     $s0,21,$acc0
397                 xor     $acc11,$s2,$s2
398         ldx     [$tbl+2048+192],%g0             ! prefetch te4
399                 xor     $acc12,$acc14,$acc14
400                 xor     $acc13,$s3,$s3
401         ldx     [$tbl+2048+224],%g0             ! prefetch te4
402         srl     $s1,13,$acc1                    !
403                 xor     $acc14,$s3,$s3
404                 xor     $acc15,$s3,$s3
405         ba      .Lenc_loop
406         and     $acc0,2040,$acc0
407
408 .align  32
409 .Lenc_last:
410                 srlx    $acc1,8,$acc1           !
411                 xor     $acc0,$t0,$t0
412         ld      [$key+0],$s0
413                 srlx    $acc2,16,$acc2
414                 xor     $acc1,$t0,$t0
415         ld      [$key+4],$s1
416                 srlx    $acc3,24,$acc3
417                 xor     $acc2,$t0,$t0
418         ld      [$key+8],$s2                    !
419                 srlx    $acc5,8,$acc5
420                 xor     $acc3,$t0,$t0
421         ld      [$key+12],$s3
422                 srlx    $acc6,16,$acc6
423                 xor     $acc4,$t1,$t1
424                 srlx    $acc7,24,$acc7
425                 xor     $acc5,$t1,$t1
426                 srlx    $acc9,8,$acc9           !
427                 xor     $acc6,$t1,$t1
428                 srlx    $acc10,16,$acc10
429                 xor     $acc7,$t1,$t1
430                 srlx    $acc11,24,$acc11
431                 xor     $acc8,$t2,$t2
432                 srlx    $acc13,8,$acc13
433                 xor     $acc9,$t2,$t2
434                 srlx    $acc14,16,$acc14        !
435                 xor     $acc10,$t2,$t2
436                 srlx    $acc15,24,$acc15
437                 xor     $acc11,$t2,$t2
438                 xor     $acc12,$acc14,$acc14
439                 xor     $acc13,$t3,$t3
440         srl     $t0,24,$acc0
441                 xor     $acc14,$t3,$t3
442         srl     $t1,16,$acc1                    !
443                 xor     $acc15,$t3,$t3
444
445         srl     $t2,8,$acc2
446         and     $acc1,255,$acc1
447         ldub    [$rounds+$acc0],$acc0
448         srl     $t1,24,$acc4
449         and     $acc2,255,$acc2
450         ldub    [$rounds+$acc1],$acc1
451         srl     $t2,16,$acc5                    !
452         and     $t3,255,$acc3
453         ldub    [$rounds+$acc2],$acc2
454         ldub    [$rounds+$acc3],$acc3
455         srl     $t3,8,$acc6
456         and     $acc5,255,$acc5
457         ldub    [$rounds+$acc4],$acc4
458         fmovs   %f0,%f0
459         srl     $t2,24,$acc8                    !
460         and     $acc6,255,$acc6
461         ldub    [$rounds+$acc5],$acc5
462         srl     $t3,16,$acc9
463         and     $t0,255,$acc7
464         ldub    [$rounds+$acc6],$acc6
465         ldub    [$rounds+$acc7],$acc7
466         fmovs   %f0,%f0
467         srl     $t0,8,$acc10                    !
468         and     $acc9,255,$acc9
469         ldub    [$rounds+$acc8],$acc8
470         srl     $t3,24,$acc12
471         and     $acc10,255,$acc10
472         ldub    [$rounds+$acc9],$acc9
473         srl     $t0,16,$acc13
474         and     $t1,255,$acc11
475         ldub    [$rounds+$acc10],$acc10         !
476         srl     $t1,8,$acc14
477         and     $acc13,255,$acc13
478         ldub    [$rounds+$acc11],$acc11
479         ldub    [$rounds+$acc12],$acc12
480         and     $acc14,255,$acc14
481         ldub    [$rounds+$acc13],$acc13
482         and     $t2,255,$acc15
483         ldub    [$rounds+$acc14],$acc14         !
484
485                 sll     $acc0,24,$acc0
486                 xor     $acc3,$s0,$s0
487         ldub    [$rounds+$acc15],$acc15
488                 sll     $acc1,16,$acc1
489                 xor     $acc0,$s0,$s0
490         ldx     [%sp+$bias+$frame+0],%i7        ! restore return address
491         fmovs   %f0,%f0
492                 sll     $acc2,8,$acc2           !
493                 xor     $acc1,$s0,$s0
494                 sll     $acc4,24,$acc4
495                 xor     $acc2,$s0,$s0
496                 sll     $acc5,16,$acc5
497                 xor     $acc7,$s1,$s1
498                 sll     $acc6,8,$acc6
499                 xor     $acc4,$s1,$s1
500                 sll     $acc8,24,$acc8          !
501                 xor     $acc5,$s1,$s1
502                 sll     $acc9,16,$acc9
503                 xor     $acc11,$s2,$s2
504                 sll     $acc10,8,$acc10
505                 xor     $acc6,$s1,$s1
506                 sll     $acc12,24,$acc12
507                 xor     $acc8,$s2,$s2
508                 sll     $acc13,16,$acc13        !
509                 xor     $acc9,$s2,$s2
510                 sll     $acc14,8,$acc14
511                 xor     $acc10,$s2,$s2
512                 xor     $acc12,$acc14,$acc14
513                 xor     $acc13,$s3,$s3
514                 xor     $acc14,$s3,$s3
515                 xor     $acc15,$s3,$s3
516
517         ret
518         restore
519 .type   _sparcv9_AES_encrypt,#function
520 .size   _sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt)
521
522 .align  32
523 .globl  AES_encrypt
524 AES_encrypt:
525         or      %o0,%o1,%g1
526         andcc   %g1,3,%g0
527         bnz,pn  %xcc,.Lunaligned_enc
528         save    %sp,-$frame,%sp
529
530         ld      [%i0+0],%o0
531         ld      [%i0+4],%o1
532         ld      [%i0+8],%o2
533         ld      [%i0+12],%o3
534
535 1:      call    .+8
536         add     %o7,AES_Te-1b,%o4
537         call    _sparcv9_AES_encrypt
538         mov     %i2,%o5
539
540         st      %o0,[%i1+0]
541         st      %o1,[%i1+4]
542         st      %o2,[%i1+8]
543         st      %o3,[%i1+12]
544
545         ret
546         restore
547
548 .align  32
549 .Lunaligned_enc:
550         ldub    [%i0+0],%l0
551         ldub    [%i0+1],%l1
552         ldub    [%i0+2],%l2
553
554         sll     %l0,24,%l0
555         ldub    [%i0+3],%l3
556         sll     %l1,16,%l1
557         ldub    [%i0+4],%l4
558         sll     %l2,8,%l2
559         or      %l1,%l0,%l0
560         ldub    [%i0+5],%l5
561         sll     %l4,24,%l4
562         or      %l3,%l2,%l2
563         ldub    [%i0+6],%l6
564         sll     %l5,16,%l5
565         or      %l0,%l2,%o0
566         ldub    [%i0+7],%l7
567
568         sll     %l6,8,%l6
569         or      %l5,%l4,%l4
570         ldub    [%i0+8],%l0
571         or      %l7,%l6,%l6
572         ldub    [%i0+9],%l1
573         or      %l4,%l6,%o1
574         ldub    [%i0+10],%l2
575
576         sll     %l0,24,%l0
577         ldub    [%i0+11],%l3
578         sll     %l1,16,%l1
579         ldub    [%i0+12],%l4
580         sll     %l2,8,%l2
581         or      %l1,%l0,%l0
582         ldub    [%i0+13],%l5
583         sll     %l4,24,%l4
584         or      %l3,%l2,%l2
585         ldub    [%i0+14],%l6
586         sll     %l5,16,%l5
587         or      %l0,%l2,%o2
588         ldub    [%i0+15],%l7
589
590         sll     %l6,8,%l6
591         or      %l5,%l4,%l4
592         or      %l7,%l6,%l6
593         or      %l4,%l6,%o3
594
595 1:      call    .+8
596         add     %o7,AES_Te-1b,%o4
597         call    _sparcv9_AES_encrypt
598         mov     %i2,%o5
599
600         srl     %o0,24,%l0
601         srl     %o0,16,%l1
602         stb     %l0,[%i1+0]
603         srl     %o0,8,%l2
604         stb     %l1,[%i1+1]
605         stb     %l2,[%i1+2]
606         srl     %o1,24,%l4
607         stb     %o0,[%i1+3]
608
609         srl     %o1,16,%l5
610         stb     %l4,[%i1+4]
611         srl     %o1,8,%l6
612         stb     %l5,[%i1+5]
613         stb     %l6,[%i1+6]
614         srl     %o2,24,%l0
615         stb     %o1,[%i1+7]
616
617         srl     %o2,16,%l1
618         stb     %l0,[%i1+8]
619         srl     %o2,8,%l2
620         stb     %l1,[%i1+9]
621         stb     %l2,[%i1+10]
622         srl     %o3,24,%l4
623         stb     %o2,[%i1+11]
624
625         srl     %o3,16,%l5
626         stb     %l4,[%i1+12]
627         srl     %o3,8,%l6
628         stb     %l5,[%i1+13]
629         stb     %l6,[%i1+14]
630         stb     %o3,[%i1+15]
631
632         ret
633         restore
634 .type   AES_encrypt,#function
635 .size   AES_encrypt,(.-AES_encrypt)
636
637 ___
638
# Open the 256-byte-aligned AES_Td table used by the decryption code.
639 $code.=<<___;
640 .align  256
641 AES_Td:
642 ___
# The 256 words of the decryption table.  _data_word writes every word twice,
# so each entry takes 8 bytes and the whole list takes 2KB; the 256-byte table
# and the .size directive that complete AES_Td are in the assembly text that
# follows this call.
643 &_data_word(
644         0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
645         0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
646         0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
647         0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
648         0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
649         0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
650         0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
651         0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
652         0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
653         0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
654         0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
655         0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
656         0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
657         0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
658         0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
659         0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
660         0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
661         0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
662         0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
663         0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
664         0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
665         0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
666         0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
667         0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
668         0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
669         0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
670         0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
671         0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
672         0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
673         0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
674         0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
675         0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
676         0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
677         0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
678         0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
679         0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
680         0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
681         0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
682         0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
683         0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
684         0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
685         0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
686         0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
687         0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
688         0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
689         0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
690         0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
691         0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
692         0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
693         0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
694         0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
695         0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
696         0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
697         0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
698         0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
699         0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
700         0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
701         0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
702         0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
703         0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
704         0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
705         0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
706         0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
707         0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
708 $code.=<<___;
709         .byte   0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
710         .byte   0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
711         .byte   0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
712         .byte   0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
713         .byte   0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
714         .byte   0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
715         .byte   0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
716         .byte   0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
717         .byte   0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
718         .byte   0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
719         .byte   0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
720         .byte   0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
721         .byte   0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
722         .byte   0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
723         .byte   0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
724         .byte   0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
725         .byte   0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
726         .byte   0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
727         .byte   0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
728         .byte   0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
729         .byte   0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
730         .byte   0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
731         .byte   0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
732         .byte   0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
733         .byte   0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
734         .byte   0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
735         .byte   0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
736         .byte   0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
737         .byte   0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
738         .byte   0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
739         .byte   0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
740         .byte   0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
741 .type   AES_Td,#object
742 .size   AES_Td,(.-AES_Td)
743
744 .align  64
745 .skip   16
! _sparcv9_AES_decrypt: table-driven core that decrypts one 16-byte block.
! The state is kept in the four words s0-s3, which also carry the result;
! tbl is the lookup table base and key points at the round keys.
! AES_decrypt in this file passes the state in %o0-%o3, the AES_Td address
! in %o4 and the key pointer in %o5, and reads the result from %o0-%o3.
! NOTE(review): the mapping of s0-s3, tbl and key onto registers is set up
! earlier in the script - presumably %i0-%i5 after the save; confirm there.
! Lookup layout: indices are a state byte times 8 and entries are fetched
! with 8-byte loads, then shifted right by 0, 8, 16 or 24 bits.  Presumably
! this yields the rotated variants of one table word; confirm against the
! AES_Td definition.
746 _sparcv9_AES_decrypt:
747         save    %sp,-$frame-$locals,%sp         ! new register window and stack frame
748         stx     %i7,[%sp+$bias+$frame+0]        ! off-load return address
749         ld      [$key+240],$rounds              ! round count, read from key+240
750         ld      [$key+0],$t0                    ! first round key, words 0-3
751         ld      [$key+4],$t1                    !
752         ld      [$key+8],$t2
753         ld      [$key+12],$t3
754         srl     $rounds,1,$rounds               ! the loop below does two rounds per pass
755         xor     $t0,$s0,$s0                     ! state ^= first round key
756         ld      [$key+16],$t0                   ! preload the following round key into t0-t3
757         xor     $t1,$s1,$s1
758         ld      [$key+20],$t1
759         srl     $s0,21,$acc0                    !
760         xor     $t2,$s2,$s2
761         ld      [$key+24],$t2
762         xor     $t3,$s3,$s3
763         and     $acc0,2040,$acc0                ! acc0 = top byte of s0 times 8 (2040 = 255*8)
764         ld      [$key+28],$t3
765         srl     $s3,13,$acc1                    ! acc1 index is started ahead of the loop as well
766         nop
767 .Ldec_loop:
! First half of a pass: sixteen 8-byte table loads indexed by the bytes of
! s0-s3 (shift by 21, 13, 5 or left by 3, then mask with 2040).
768         srl     $s2,5,$acc2                     !
769         and     $acc1,2040,$acc1
770         ldx     [$tbl+$acc0],$acc0
771         sll     $s1,3,$acc3
772         and     $acc2,2040,$acc2
773         ldx     [$tbl+$acc1],$acc1
774         srl     $s1,21,$acc4
775         and     $acc3,2040,$acc3
776         ldx     [$tbl+$acc2],$acc2              !
777         srl     $s0,13,$acc5
778         and     $acc4,2040,$acc4
779         ldx     [$tbl+$acc3],$acc3
780         srl     $s3,5,$acc6
781         and     $acc5,2040,$acc5
782         ldx     [$tbl+$acc4],$acc4
783         fmovs   %f0,%f0
784         sll     $s2,3,$acc7                     !
785         and     $acc6,2040,$acc6
786         ldx     [$tbl+$acc5],$acc5
787         srl     $s2,21,$acc8
788         and     $acc7,2040,$acc7
789         ldx     [$tbl+$acc6],$acc6
790         srl     $s1,13,$acc9
791         and     $acc8,2040,$acc8
792         ldx     [$tbl+$acc7],$acc7              !
793         srl     $s0,5,$acc10
794         and     $acc9,2040,$acc9
795         ldx     [$tbl+$acc8],$acc8
796         sll     $s3,3,$acc11
797         and     $acc10,2040,$acc10
798         ldx     [$tbl+$acc9],$acc9
799         fmovs   %f0,%f0
800         srl     $s3,21,$acc12                   !
801         and     $acc11,2040,$acc11
802         ldx     [$tbl+$acc10],$acc10
803         srl     $s2,13,$acc13
804         and     $acc12,2040,$acc12
805         ldx     [$tbl+$acc11],$acc11
806         srl     $s1,5,$acc14
807         and     $acc13,2040,$acc13
808         ldx     [$tbl+$acc12],$acc12            !
809         sll     $s0,3,$acc15
810         and     $acc14,2040,$acc14
811         ldx     [$tbl+$acc13],$acc13
812         and     $acc15,2040,$acc15
813         add     $key,32,$key                    ! advance the key pointer by 32 bytes (two round keys)
814         ldx     [$tbl+$acc14],$acc14
815         fmovs   %f0,%f0
816         subcc   $rounds,1,$rounds               !
817         ldx     [$tbl+$acc15],$acc15
818         bz,a,pn %icc,.Ldec_last                 ! count exhausted: finish in .Ldec_last
819         add     $tbl,2048,$rounds               ! annulled slot, runs only when the branch is taken
820
! Fold the sixteen lookups into t0-t3 (which already hold a round key)
! and fetch the next round key into s0-s3.
821                 srlx    $acc1,8,$acc1
822                 xor     $acc0,$t0,$t0
823         ld      [$key+0],$s0
824         fmovs   %f0,%f0
825                 srlx    $acc2,16,$acc2          !
826                 xor     $acc1,$t0,$t0
827         ld      [$key+4],$s1
828                 srlx    $acc3,24,$acc3
829                 xor     $acc2,$t0,$t0
830         ld      [$key+8],$s2
831                 srlx    $acc5,8,$acc5
832                 xor     $acc3,$t0,$t0
833         ld      [$key+12],$s3                   !
834                 srlx    $acc6,16,$acc6
835                 xor     $acc4,$t1,$t1
836         fmovs   %f0,%f0
837                 srlx    $acc7,24,$acc7
838                 xor     $acc5,$t1,$t1
839                 srlx    $acc9,8,$acc9
840                 xor     $acc6,$t1,$t1
841                 srlx    $acc10,16,$acc10        !
842                 xor     $acc7,$t1,$t1
843                 srlx    $acc11,24,$acc11
844                 xor     $acc8,$t2,$t2
845                 srlx    $acc13,8,$acc13
846                 xor     $acc9,$t2,$t2
847                 srlx    $acc14,16,$acc14
848                 xor     $acc10,$t2,$t2
849                 srlx    $acc15,24,$acc15        !
850                 xor     $acc11,$t2,$t2
851                 xor     $acc12,$acc14,$acc14
852                 xor     $acc13,$t3,$t3
853         srl     $t0,21,$acc0
854                 xor     $acc14,$t3,$t3
855                 xor     $acc15,$t3,$t3
856         srl     $t3,13,$acc1
857
! Second half of a pass: the same sixteen lookups, now indexed by the bytes
! of t0-t3 and accumulated into s0-s3.
858         and     $acc0,2040,$acc0                !
859         srl     $t2,5,$acc2
860         and     $acc1,2040,$acc1
861         ldx     [$tbl+$acc0],$acc0
862         sll     $t1,3,$acc3
863         and     $acc2,2040,$acc2
864         ldx     [$tbl+$acc1],$acc1
865         fmovs   %f0,%f0
866         srl     $t1,21,$acc4                    !
867         and     $acc3,2040,$acc3
868         ldx     [$tbl+$acc2],$acc2
869         srl     $t0,13,$acc5
870         and     $acc4,2040,$acc4
871         ldx     [$tbl+$acc3],$acc3
872         srl     $t3,5,$acc6
873         and     $acc5,2040,$acc5
874         ldx     [$tbl+$acc4],$acc4              !
875         sll     $t2,3,$acc7
876         and     $acc6,2040,$acc6
877         ldx     [$tbl+$acc5],$acc5
878         srl     $t2,21,$acc8
879         and     $acc7,2040,$acc7
880         ldx     [$tbl+$acc6],$acc6
881         fmovs   %f0,%f0
882         srl     $t1,13,$acc9                    !
883         and     $acc8,2040,$acc8
884         ldx     [$tbl+$acc7],$acc7
885         srl     $t0,5,$acc10
886         and     $acc9,2040,$acc9
887         ldx     [$tbl+$acc8],$acc8
888         sll     $t3,3,$acc11
889         and     $acc10,2040,$acc10
890         ldx     [$tbl+$acc9],$acc9              !
891         srl     $t3,21,$acc12
892         and     $acc11,2040,$acc11
893         ldx     [$tbl+$acc10],$acc10
894         srl     $t2,13,$acc13
895         and     $acc12,2040,$acc12
896         ldx     [$tbl+$acc11],$acc11
897         fmovs   %f0,%f0
898         srl     $t1,5,$acc14                    !
899         and     $acc13,2040,$acc13
900         ldx     [$tbl+$acc12],$acc12
901         sll     $t0,3,$acc15
902         and     $acc14,2040,$acc14
903         ldx     [$tbl+$acc13],$acc13
904                 srlx    $acc1,8,$acc1
905         and     $acc15,2040,$acc15
906         ldx     [$tbl+$acc14],$acc14            !
907
! Fold into s0-s3, reload t0-t3 with the round key at key+16 and touch the
! 256 bytes at tbl+2048 in 32-byte steps (loads into %g0 discard the data).
908                 srlx    $acc2,16,$acc2
909                 xor     $acc0,$s0,$s0
910         ldx     [$tbl+$acc15],$acc15
911                 srlx    $acc3,24,$acc3
912                 xor     $acc1,$s0,$s0
913         ld      [$key+16],$t0
914         fmovs   %f0,%f0
915                 srlx    $acc5,8,$acc5           !
916                 xor     $acc2,$s0,$s0
917         ld      [$key+20],$t1
918                 srlx    $acc6,16,$acc6
919                 xor     $acc3,$s0,$s0
920         ld      [$key+24],$t2
921                 srlx    $acc7,24,$acc7
922                 xor     $acc4,$s1,$s1
923         ld      [$key+28],$t3                   !
924                 srlx    $acc9,8,$acc9
925                 xor     $acc5,$s1,$s1
926         ldx     [$tbl+2048+0],%g0               ! prefetch td4
927                 srlx    $acc10,16,$acc10
928                 xor     $acc6,$s1,$s1
929         ldx     [$tbl+2048+32],%g0              ! prefetch td4
930                 srlx    $acc11,24,$acc11
931                 xor     $acc7,$s1,$s1
932         ldx     [$tbl+2048+64],%g0              ! prefetch td4
933                 srlx    $acc13,8,$acc13
934                 xor     $acc8,$s2,$s2
935         ldx     [$tbl+2048+96],%g0              ! prefetch td4
936                 srlx    $acc14,16,$acc14        !
937                 xor     $acc9,$s2,$s2
938         ldx     [$tbl+2048+128],%g0             ! prefetch td4
939                 srlx    $acc15,24,$acc15
940                 xor     $acc10,$s2,$s2
941         ldx     [$tbl+2048+160],%g0             ! prefetch td4
942         srl     $s0,21,$acc0
943                 xor     $acc11,$s2,$s2
944         ldx     [$tbl+2048+192],%g0             ! prefetch td4
945                 xor     $acc12,$acc14,$acc14
946                 xor     $acc13,$s3,$s3
947         ldx     [$tbl+2048+224],%g0             ! prefetch td4
948         and     $acc0,2040,$acc0                !
949                 xor     $acc14,$s3,$s3
950                 xor     $acc15,$s3,$s3
951         ba      .Ldec_loop
952         srl     $s3,13,$acc1                    ! delay slot: start acc1 for the next pass
953
954 .align  32
955 .Ldec_last:
! Entered with rounds = tbl+2048.  First finish the pending round into
! t0-t3 and load the last round key into s0-s3, then do the final round
! with single-byte lookups.
956                 srlx    $acc1,8,$acc1           !
957                 xor     $acc0,$t0,$t0
958         ld      [$key+0],$s0
959                 srlx    $acc2,16,$acc2
960                 xor     $acc1,$t0,$t0
961         ld      [$key+4],$s1
962                 srlx    $acc3,24,$acc3
963                 xor     $acc2,$t0,$t0
964         ld      [$key+8],$s2                    !
965                 srlx    $acc5,8,$acc5
966                 xor     $acc3,$t0,$t0
967         ld      [$key+12],$s3
968                 srlx    $acc6,16,$acc6
969                 xor     $acc4,$t1,$t1
970                 srlx    $acc7,24,$acc7
971                 xor     $acc5,$t1,$t1
972                 srlx    $acc9,8,$acc9           !
973                 xor     $acc6,$t1,$t1
974                 srlx    $acc10,16,$acc10
975                 xor     $acc7,$t1,$t1
976                 srlx    $acc11,24,$acc11
977                 xor     $acc8,$t2,$t2
978                 srlx    $acc13,8,$acc13
979                 xor     $acc9,$t2,$t2
980                 srlx    $acc14,16,$acc14        !
981                 xor     $acc10,$t2,$t2
982                 srlx    $acc15,24,$acc15
983                 xor     $acc11,$t2,$t2
984                 xor     $acc12,$acc14,$acc14
985                 xor     $acc13,$t3,$t3
986         srl     $t0,24,$acc0                    ! final round indices are plain bytes, not scaled by 8
987                 xor     $acc14,$t3,$t3
988                 xor     $acc15,$t3,$t3          !
989         srl     $t3,16,$acc1
990
991         srl     $t2,8,$acc2
992         and     $acc1,255,$acc1
993         ldub    [$rounds+$acc0],$acc0           ! rounds is the byte table base tbl+2048 here
994         srl     $t1,24,$acc4
995         and     $acc2,255,$acc2
996         ldub    [$rounds+$acc1],$acc1
997         srl     $t0,16,$acc5                    !
998         and     $t1,255,$acc3
999         ldub    [$rounds+$acc2],$acc2
1000         ldub    [$rounds+$acc3],$acc3
1001         srl     $t3,8,$acc6
1002         and     $acc5,255,$acc5
1003         ldub    [$rounds+$acc4],$acc4
1004         fmovs   %f0,%f0
1005         srl     $t2,24,$acc8                    !
1006         and     $acc6,255,$acc6
1007         ldub    [$rounds+$acc5],$acc5
1008         srl     $t1,16,$acc9
1009         and     $t2,255,$acc7
1010         ldub    [$rounds+$acc6],$acc6
1011         ldub    [$rounds+$acc7],$acc7
1012         fmovs   %f0,%f0
1013         srl     $t0,8,$acc10                    !
1014         and     $acc9,255,$acc9
1015         ldub    [$rounds+$acc8],$acc8
1016         srl     $t3,24,$acc12
1017         and     $acc10,255,$acc10
1018         ldub    [$rounds+$acc9],$acc9
1019         srl     $t2,16,$acc13
1020         and     $t3,255,$acc11
1021         ldub    [$rounds+$acc10],$acc10         !
1022         srl     $t1,8,$acc14
1023         and     $acc13,255,$acc13
1024         ldub    [$rounds+$acc11],$acc11
1025         ldub    [$rounds+$acc12],$acc12
1026         and     $acc14,255,$acc14
1027         ldub    [$rounds+$acc13],$acc13
1028         and     $t0,255,$acc15
1029         ldub    [$rounds+$acc14],$acc14         !
1030
! Shift each looked-up byte into its lane (24, 16, 8 or 0) and fold it into
! s0-s3 on top of the last round key.
1031                 sll     $acc0,24,$acc0
1032                 xor     $acc3,$s0,$s0
1033         ldub    [$rounds+$acc15],$acc15
1034                 sll     $acc1,16,$acc1
1035                 xor     $acc0,$s0,$s0
1036         ldx     [%sp+$bias+$frame+0],%i7        ! restore return address
1037         fmovs   %f0,%f0
1038                 sll     $acc2,8,$acc2           !
1039                 xor     $acc1,$s0,$s0
1040                 sll     $acc4,24,$acc4
1041                 xor     $acc2,$s0,$s0
1042                 sll     $acc5,16,$acc5
1043                 xor     $acc7,$s1,$s1
1044                 sll     $acc6,8,$acc6
1045                 xor     $acc4,$s1,$s1
1046                 sll     $acc8,24,$acc8          !
1047                 xor     $acc5,$s1,$s1
1048                 sll     $acc9,16,$acc9
1049                 xor     $acc11,$s2,$s2
1050                 sll     $acc10,8,$acc10
1051                 xor     $acc6,$s1,$s1
1052                 sll     $acc12,24,$acc12
1053                 xor     $acc8,$s2,$s2
1054                 sll     $acc13,16,$acc13        !
1055                 xor     $acc9,$s2,$s2
1056                 sll     $acc14,8,$acc14
1057                 xor     $acc10,$s2,$s2
1058                 xor     $acc12,$acc14,$acc14
1059                 xor     $acc13,$s3,$s3
1060                 xor     $acc14,$s3,$s3
1061                 xor     $acc15,$s3,$s3
1062
1063         ret
1064         restore                                 ! delay slot: pop the register window
1065 .type   _sparcv9_AES_decrypt,#function
1066 .size   _sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt)
1067
1068 .align  32
1069 .globl  AES_decrypt
! AES_decrypt: exported entry point.  Reads 16 bytes through the first
! argument, runs _sparcv9_AES_decrypt on them with the third argument
! forwarded as the key pointer, and writes 16 bytes through the second
! argument.  When either of the first two pointers is not 4-byte aligned
! the byte-wise path at .Lunaligned_dec replaces the word loads and stores.
1070 AES_decrypt:
1071         or      %o0,%o1,%g1                     ! merge both data pointers
1072         andcc   %g1,3,%g0                       ! any of the low two bits set?
1073         bnz,pn  %xcc,.Lunaligned_dec            ! yes: take the byte-wise path
1074         save    %sp,-$frame,%sp                 ! delay slot, executed on both paths
1075
1076         ld      [%i0+0],%o0                     ! load the block as four 32-bit words
1077         ld      [%i0+4],%o1
1078         ld      [%i0+8],%o2
1079         ld      [%i0+12],%o3
1080
1081 1:      call    .+8                             ! position-independent: %o7 = address of label 1
1082         add     %o7,AES_Td-1b,%o4               ! delay slot: %o4 = address of AES_Td
1083         call    _sparcv9_AES_decrypt
1084         mov     %i2,%o5                         ! delay slot: %o5 = third argument (key pointer)
1085
1086         st      %o0,[%i1+0]                     ! store the four result words
1087         st      %o1,[%i1+4]
1088         st      %o2,[%i1+8]
1089         st      %o3,[%i1+12]
1090
1091         ret
1092         restore
1093
1094 .align  32
1095 .Lunaligned_dec:
! Build the four state words in %o0-%o3 from single bytes, first byte of
! each group of four in the most significant position.
1096         ldub    [%i0+0],%l0
1097         ldub    [%i0+1],%l1
1098         ldub    [%i0+2],%l2
1099
1100         sll     %l0,24,%l0
1101         ldub    [%i0+3],%l3
1102         sll     %l1,16,%l1
1103         ldub    [%i0+4],%l4
1104         sll     %l2,8,%l2
1105         or      %l1,%l0,%l0
1106         ldub    [%i0+5],%l5
1107         sll     %l4,24,%l4
1108         or      %l3,%l2,%l2
1109         ldub    [%i0+6],%l6
1110         sll     %l5,16,%l5
1111         or      %l0,%l2,%o0                     ! word 0 complete
1112         ldub    [%i0+7],%l7
1113
1114         sll     %l6,8,%l6
1115         or      %l5,%l4,%l4
1116         ldub    [%i0+8],%l0
1117         or      %l7,%l6,%l6
1118         ldub    [%i0+9],%l1
1119         or      %l4,%l6,%o1                     ! word 1 complete
1120         ldub    [%i0+10],%l2
1121
1122         sll     %l0,24,%l0
1123         ldub    [%i0+11],%l3
1124         sll     %l1,16,%l1
1125         ldub    [%i0+12],%l4
1126         sll     %l2,8,%l2
1127         or      %l1,%l0,%l0
1128         ldub    [%i0+13],%l5
1129         sll     %l4,24,%l4
1130         or      %l3,%l2,%l2
1131         ldub    [%i0+14],%l6
1132         sll     %l5,16,%l5
1133         or      %l0,%l2,%o2                     ! word 2 complete
1134         ldub    [%i0+15],%l7
1135
1136         sll     %l6,8,%l6
1137         or      %l5,%l4,%l4
1138         or      %l7,%l6,%l6
1139         or      %l4,%l6,%o3                     ! word 3 complete
1140
1141 1:      call    .+8                             ! position-independent: %o7 = address of label 1
1142         add     %o7,AES_Td-1b,%o4               ! delay slot: %o4 = address of AES_Td
1143         call    _sparcv9_AES_decrypt
1144         mov     %i2,%o5                         ! delay slot: %o5 = third argument (key pointer)
1145
! Write the result back one byte at a time, most significant byte of each
! word first.
1146         srl     %o0,24,%l0
1147         srl     %o0,16,%l1
1148         stb     %l0,[%i1+0]
1149         srl     %o0,8,%l2
1150         stb     %l1,[%i1+1]
1151         stb     %l2,[%i1+2]
1152         srl     %o1,24,%l4
1153         stb     %o0,[%i1+3]
1154
1155         srl     %o1,16,%l5
1156         stb     %l4,[%i1+4]
1157         srl     %o1,8,%l6
1158         stb     %l5,[%i1+5]
1159         stb     %l6,[%i1+6]
1160         srl     %o2,24,%l0
1161         stb     %o1,[%i1+7]
1162
1163         srl     %o2,16,%l1
1164         stb     %l0,[%i1+8]
1165         srl     %o2,8,%l2
1166         stb     %l1,[%i1+9]
1167         stb     %l2,[%i1+10]
1168         srl     %o3,24,%l4
1169         stb     %o2,[%i1+11]
1170
1171         srl     %o3,16,%l5
1172         stb     %l4,[%i1+12]
1173         srl     %o3,8,%l6
1174         stb     %l5,[%i1+13]
1175         stb     %l6,[%i1+14]
1176         stb     %o3,[%i1+15]
1177
1178         ret
1179         restore
1180 .type   AES_decrypt,#function
1181 .size   AES_decrypt,(.-AES_decrypt)
1182 ___
1183
1184 # fmovs instructions substituting for FP nops were originally added
1185 # to meet specific instruction alignment requirements to maximize ILP.
1186 # As UltraSPARC T1, a.k.a. Niagara, has a shared FPU, FP nops can have
1187 # an undesired effect, so just omit them and sacrifice a fraction of a
1188 # percent in performance...
1189 $code =~ s/fmovs.*$//gm;        # drop every fmovs up to the end of its line
1190
1191 print $code;                    # emit the generated assembly on STDOUT
1192 close STDOUT or die "error closing STDOUT: $!";   # flush; a failed write must not pass silently