f57a9a3fea2577112dcee6ef9b5d5102c54dfa53
[openssl.git] / crypto / aes / asm / aes-sparcv9.pl
1 #! /usr/bin/env perl
2 # Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. Rights for redistribution and usage in source and binary
13 # forms are granted according to the License.
14 # ====================================================================
15 #
16 # Version 1.1
17 #
18 # The major reason for undertaking this effort was to mitigate the hazard
19 # of cache-timing attacks. This is [currently and initially!] addressed in
20 # two ways. 1. S-boxes are compressed from 5KB to 2KB+256B size each.
21 # 2. References to them are scheduled for L2 cache latency, meaning
22 # that the tables don't have to reside in L1 cache. Once again, this
23 # is an initial draft and one should expect more countermeasures to
24 # be implemented...
25 #
26 # Version 1.1 prefetches T[ed]4 in order to mitigate attack on last
27 # round.
28 #
29 # Even though performance was not the primary goal [on the contrary,
30 # extra shifts "induced" by compressed S-box and longer loop epilogue
31 # "induced" by scheduling for L2 have negative effect on performance],
32 # the code turned out to run in ~23 cycles per processed byte en-/
33 # decrypted with a 128-bit key. This is a pretty good result for code
34 # with the mentioned qualities on an UltraSPARC core. Compared to Sun C
35 # generated code, my encrypt procedure runs just a few percent faster,
36 # while the decrypt one is a whole 50% faster [yes, Sun C failed to
37 # generate an optimal decrypt procedure]. Compared to GNU C generated code,
38 # both procedures are more than 60% faster:-)
39
# The last command-line argument, when present and true, names the file that
# STDOUT is redirected to; with no such argument STDOUT is left untouched.
$output = pop;
if ($output) {
    # Three-argument open keeps characters in the file name from being
    # parsed as part of the open mode, and the result is checked so that an
    # unwritable path aborts the run instead of being silently ignored.
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}
41
# Stack-frame symbols are kept symbolic and spliced into the assembly text
# as-is (presumably resolved through sparc_arch.h -- confirm). $locals is the
# number of extra bytes each internal routine reserves in its "save".
($frame, $bias, $locals) = ("STACK_FRAME", "STACK_BIAS", 16);

# Sixteen table-lookup accumulators. Each group of four is folded into one
# state word; every group is led by a local register followed by three
# out/global registers.
($acc0,  $acc1,  $acc2,  $acc3 ) = ("%l0", "%o0", "%o1", "%o2");
($acc4,  $acc5,  $acc6,  $acc7 ) = ("%l1", "%o3", "%o4", "%o5");
($acc8,  $acc9,  $acc10, $acc11) = ("%l2", "%o7", "%g1", "%g2");
($acc12, $acc13, $acc14, $acc15) = ("%l3", "%g3", "%g4", "%g5");

# Round-key words, which double as the intermediate state on odd rounds.
($t0, $t1, $t2, $t3) = map { "%l$_" } 4 .. 7;

# Cipher state words, followed by the table base and the key schedule
# pointer, all living in the input registers.
($s0, $s1, $s2, $s3) = map { "%i$_" } 0 .. 3;
($tbl, $key) = ("%i4", "%i5");

# aliases with return address, which is off-loaded to stack
$rounds = "%i7";
78
# Append lookup-table data to the assembly text accumulated in $code.
#
# Arguments: a list of 32-bit integers. Each one becomes a single ".long"
# directive carrying the value twice, so every table entry occupies eight
# bytes; the generated code indexes the table with an 8-byte stride (mask
# 2040) and fetches entries with 64-bit "ldx" loads. Processing stops at the
# first undefined argument or when the list is exhausted.
#
# Returns nothing useful; the only effect is the text appended to $code.
#
# Declared without a prototype: the sub takes a list, and an empty "()"
# prototype would reject any call not written in the &-form.
sub _data_word
{
    while (defined(my $word = shift)) {
        $code .= sprintf("\t.long\t0x%08x,0x%08x\n", $word, $word);
    }
}
83
84 $code.=<<___;
85 #include "sparc_arch.h"
86
87 #ifdef  __arch64__
88 .register       %g2,#scratch
89 .register       %g3,#scratch
90 #endif
91 .section        ".text",#alloc,#execinstr
92
93 .align  256
94 AES_Te:
95 ___
96 &_data_word(
97         0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
98         0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
99         0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
100         0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
101         0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
102         0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
103         0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
104         0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
105         0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
106         0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
107         0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
108         0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
109         0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
110         0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
111         0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
112         0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
113         0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
114         0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
115         0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
116         0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
117         0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
118         0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
119         0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
120         0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
121         0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
122         0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
123         0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
124         0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
125         0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
126         0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
127         0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
128         0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
129         0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
130         0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
131         0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
132         0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
133         0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
134         0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
135         0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
136         0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
137         0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
138         0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
139         0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
140         0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
141         0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
142         0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
143         0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
144         0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
145         0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
146         0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
147         0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
148         0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
149         0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
150         0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
151         0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
152         0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
153         0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
154         0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
155         0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
156         0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
157         0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
158         0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
159         0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
160         0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
161 $code.=<<___;
162         .byte   0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
163         .byte   0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
164         .byte   0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
165         .byte   0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
166         .byte   0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
167         .byte   0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
168         .byte   0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
169         .byte   0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
170         .byte   0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
171         .byte   0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
172         .byte   0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
173         .byte   0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
174         .byte   0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
175         .byte   0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
176         .byte   0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
177         .byte   0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
178         .byte   0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
179         .byte   0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
180         .byte   0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
181         .byte   0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
182         .byte   0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
183         .byte   0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
184         .byte   0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
185         .byte   0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
186         .byte   0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
187         .byte   0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
188         .byte   0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
189         .byte   0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
190         .byte   0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
191         .byte   0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
192         .byte   0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
193         .byte   0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
194 .type   AES_Te,#object
195 .size   AES_Te,(.-AES_Te)
196
197 .align  64
198 .skip   16
199 _sparcv9_AES_encrypt:
200         save    %sp,-$frame-$locals,%sp
201         stx     %i7,[%sp+$bias+$frame+0]        ! off-load return address
202         ld      [$key+240],$rounds
203         ld      [$key+0],$t0
204         ld      [$key+4],$t1                    !
205         ld      [$key+8],$t2
206         srl     $rounds,1,$rounds
207         xor     $t0,$s0,$s0
208         ld      [$key+12],$t3
209         srl     $s0,21,$acc0
210         xor     $t1,$s1,$s1
211         ld      [$key+16],$t0
212         srl     $s1,13,$acc1                    !
213         xor     $t2,$s2,$s2
214         ld      [$key+20],$t1
215         xor     $t3,$s3,$s3
216         ld      [$key+24],$t2
217         and     $acc0,2040,$acc0
218         ld      [$key+28],$t3
219         nop
220 .Lenc_loop:
221         srl     $s2,5,$acc2                     !
222         and     $acc1,2040,$acc1
223         ldx     [$tbl+$acc0],$acc0
224         sll     $s3,3,$acc3
225         and     $acc2,2040,$acc2
226         ldx     [$tbl+$acc1],$acc1
227         srl     $s1,21,$acc4
228         and     $acc3,2040,$acc3
229         ldx     [$tbl+$acc2],$acc2              !
230         srl     $s2,13,$acc5
231         and     $acc4,2040,$acc4
232         ldx     [$tbl+$acc3],$acc3
233         srl     $s3,5,$acc6
234         and     $acc5,2040,$acc5
235         ldx     [$tbl+$acc4],$acc4
236         fmovs   %f0,%f0
237         sll     $s0,3,$acc7                     !
238         and     $acc6,2040,$acc6
239         ldx     [$tbl+$acc5],$acc5
240         srl     $s2,21,$acc8
241         and     $acc7,2040,$acc7
242         ldx     [$tbl+$acc6],$acc6
243         srl     $s3,13,$acc9
244         and     $acc8,2040,$acc8
245         ldx     [$tbl+$acc7],$acc7              !
246         srl     $s0,5,$acc10
247         and     $acc9,2040,$acc9
248         ldx     [$tbl+$acc8],$acc8
249         sll     $s1,3,$acc11
250         and     $acc10,2040,$acc10
251         ldx     [$tbl+$acc9],$acc9
252         fmovs   %f0,%f0
253         srl     $s3,21,$acc12                   !
254         and     $acc11,2040,$acc11
255         ldx     [$tbl+$acc10],$acc10
256         srl     $s0,13,$acc13
257         and     $acc12,2040,$acc12
258         ldx     [$tbl+$acc11],$acc11
259         srl     $s1,5,$acc14
260         and     $acc13,2040,$acc13
261         ldx     [$tbl+$acc12],$acc12            !
262         sll     $s2,3,$acc15
263         and     $acc14,2040,$acc14
264         ldx     [$tbl+$acc13],$acc13
265         and     $acc15,2040,$acc15
266         add     $key,32,$key
267         ldx     [$tbl+$acc14],$acc14
268         fmovs   %f0,%f0
269         subcc   $rounds,1,$rounds               !
270         ldx     [$tbl+$acc15],$acc15
271         bz,a,pn %icc,.Lenc_last
272         add     $tbl,2048,$rounds
273
274                 srlx    $acc1,8,$acc1
275                 xor     $acc0,$t0,$t0
276         ld      [$key+0],$s0
277         fmovs   %f0,%f0
278                 srlx    $acc2,16,$acc2          !
279                 xor     $acc1,$t0,$t0
280         ld      [$key+4],$s1
281                 srlx    $acc3,24,$acc3
282                 xor     $acc2,$t0,$t0
283         ld      [$key+8],$s2
284                 srlx    $acc5,8,$acc5
285                 xor     $acc3,$t0,$t0
286         ld      [$key+12],$s3                   !
287                 srlx    $acc6,16,$acc6
288                 xor     $acc4,$t1,$t1
289         fmovs   %f0,%f0
290                 srlx    $acc7,24,$acc7
291                 xor     $acc5,$t1,$t1
292                 srlx    $acc9,8,$acc9
293                 xor     $acc6,$t1,$t1
294                 srlx    $acc10,16,$acc10        !
295                 xor     $acc7,$t1,$t1
296                 srlx    $acc11,24,$acc11
297                 xor     $acc8,$t2,$t2
298                 srlx    $acc13,8,$acc13
299                 xor     $acc9,$t2,$t2
300                 srlx    $acc14,16,$acc14
301                 xor     $acc10,$t2,$t2
302                 srlx    $acc15,24,$acc15        !
303                 xor     $acc11,$t2,$t2
304                 xor     $acc12,$acc14,$acc14
305                 xor     $acc13,$t3,$t3
306         srl     $t0,21,$acc0
307                 xor     $acc14,$t3,$t3
308         srl     $t1,13,$acc1
309                 xor     $acc15,$t3,$t3
310
311         and     $acc0,2040,$acc0                !
312         srl     $t2,5,$acc2
313         and     $acc1,2040,$acc1
314         ldx     [$tbl+$acc0],$acc0
315         sll     $t3,3,$acc3
316         and     $acc2,2040,$acc2
317         ldx     [$tbl+$acc1],$acc1
318         fmovs   %f0,%f0
319         srl     $t1,21,$acc4                    !
320         and     $acc3,2040,$acc3
321         ldx     [$tbl+$acc2],$acc2
322         srl     $t2,13,$acc5
323         and     $acc4,2040,$acc4
324         ldx     [$tbl+$acc3],$acc3
325         srl     $t3,5,$acc6
326         and     $acc5,2040,$acc5
327         ldx     [$tbl+$acc4],$acc4              !
328         sll     $t0,3,$acc7
329         and     $acc6,2040,$acc6
330         ldx     [$tbl+$acc5],$acc5
331         srl     $t2,21,$acc8
332         and     $acc7,2040,$acc7
333         ldx     [$tbl+$acc6],$acc6
334         fmovs   %f0,%f0
335         srl     $t3,13,$acc9                    !
336         and     $acc8,2040,$acc8
337         ldx     [$tbl+$acc7],$acc7
338         srl     $t0,5,$acc10
339         and     $acc9,2040,$acc9
340         ldx     [$tbl+$acc8],$acc8
341         sll     $t1,3,$acc11
342         and     $acc10,2040,$acc10
343         ldx     [$tbl+$acc9],$acc9              !
344         srl     $t3,21,$acc12
345         and     $acc11,2040,$acc11
346         ldx     [$tbl+$acc10],$acc10
347         srl     $t0,13,$acc13
348         and     $acc12,2040,$acc12
349         ldx     [$tbl+$acc11],$acc11
350         fmovs   %f0,%f0
351         srl     $t1,5,$acc14                    !
352         and     $acc13,2040,$acc13
353         ldx     [$tbl+$acc12],$acc12
354         sll     $t2,3,$acc15
355         and     $acc14,2040,$acc14
356         ldx     [$tbl+$acc13],$acc13
357                 srlx    $acc1,8,$acc1
358         and     $acc15,2040,$acc15
359         ldx     [$tbl+$acc14],$acc14            !
360
361                 srlx    $acc2,16,$acc2
362                 xor     $acc0,$s0,$s0
363         ldx     [$tbl+$acc15],$acc15
364                 srlx    $acc3,24,$acc3
365                 xor     $acc1,$s0,$s0
366         ld      [$key+16],$t0
367         fmovs   %f0,%f0
368                 srlx    $acc5,8,$acc5           !
369                 xor     $acc2,$s0,$s0
370         ld      [$key+20],$t1
371                 srlx    $acc6,16,$acc6
372                 xor     $acc3,$s0,$s0
373         ld      [$key+24],$t2
374                 srlx    $acc7,24,$acc7
375                 xor     $acc4,$s1,$s1
376         ld      [$key+28],$t3                   !
377                 srlx    $acc9,8,$acc9
378                 xor     $acc5,$s1,$s1
379         ldx     [$tbl+2048+0],%g0               ! prefetch te4
380                 srlx    $acc10,16,$acc10
381                 xor     $acc6,$s1,$s1
382         ldx     [$tbl+2048+32],%g0              ! prefetch te4
383                 srlx    $acc11,24,$acc11
384                 xor     $acc7,$s1,$s1
385         ldx     [$tbl+2048+64],%g0              ! prefetch te4
386                 srlx    $acc13,8,$acc13
387                 xor     $acc8,$s2,$s2
388         ldx     [$tbl+2048+96],%g0              ! prefetch te4
389                 srlx    $acc14,16,$acc14        !
390                 xor     $acc9,$s2,$s2
391         ldx     [$tbl+2048+128],%g0             ! prefetch te4
392                 srlx    $acc15,24,$acc15
393                 xor     $acc10,$s2,$s2
394         ldx     [$tbl+2048+160],%g0             ! prefetch te4
395         srl     $s0,21,$acc0
396                 xor     $acc11,$s2,$s2
397         ldx     [$tbl+2048+192],%g0             ! prefetch te4
398                 xor     $acc12,$acc14,$acc14
399                 xor     $acc13,$s3,$s3
400         ldx     [$tbl+2048+224],%g0             ! prefetch te4
401         srl     $s1,13,$acc1                    !
402                 xor     $acc14,$s3,$s3
403                 xor     $acc15,$s3,$s3
404         ba      .Lenc_loop
405         and     $acc0,2040,$acc0
406
407 .align  32
408 .Lenc_last:
409                 srlx    $acc1,8,$acc1           !
410                 xor     $acc0,$t0,$t0
411         ld      [$key+0],$s0
412                 srlx    $acc2,16,$acc2
413                 xor     $acc1,$t0,$t0
414         ld      [$key+4],$s1
415                 srlx    $acc3,24,$acc3
416                 xor     $acc2,$t0,$t0
417         ld      [$key+8],$s2                    !
418                 srlx    $acc5,8,$acc5
419                 xor     $acc3,$t0,$t0
420         ld      [$key+12],$s3
421                 srlx    $acc6,16,$acc6
422                 xor     $acc4,$t1,$t1
423                 srlx    $acc7,24,$acc7
424                 xor     $acc5,$t1,$t1
425                 srlx    $acc9,8,$acc9           !
426                 xor     $acc6,$t1,$t1
427                 srlx    $acc10,16,$acc10
428                 xor     $acc7,$t1,$t1
429                 srlx    $acc11,24,$acc11
430                 xor     $acc8,$t2,$t2
431                 srlx    $acc13,8,$acc13
432                 xor     $acc9,$t2,$t2
433                 srlx    $acc14,16,$acc14        !
434                 xor     $acc10,$t2,$t2
435                 srlx    $acc15,24,$acc15
436                 xor     $acc11,$t2,$t2
437                 xor     $acc12,$acc14,$acc14
438                 xor     $acc13,$t3,$t3
439         srl     $t0,24,$acc0
440                 xor     $acc14,$t3,$t3
441         srl     $t1,16,$acc1                    !
442                 xor     $acc15,$t3,$t3
443
444         srl     $t2,8,$acc2
445         and     $acc1,255,$acc1
446         ldub    [$rounds+$acc0],$acc0
447         srl     $t1,24,$acc4
448         and     $acc2,255,$acc2
449         ldub    [$rounds+$acc1],$acc1
450         srl     $t2,16,$acc5                    !
451         and     $t3,255,$acc3
452         ldub    [$rounds+$acc2],$acc2
453         ldub    [$rounds+$acc3],$acc3
454         srl     $t3,8,$acc6
455         and     $acc5,255,$acc5
456         ldub    [$rounds+$acc4],$acc4
457         fmovs   %f0,%f0
458         srl     $t2,24,$acc8                    !
459         and     $acc6,255,$acc6
460         ldub    [$rounds+$acc5],$acc5
461         srl     $t3,16,$acc9
462         and     $t0,255,$acc7
463         ldub    [$rounds+$acc6],$acc6
464         ldub    [$rounds+$acc7],$acc7
465         fmovs   %f0,%f0
466         srl     $t0,8,$acc10                    !
467         and     $acc9,255,$acc9
468         ldub    [$rounds+$acc8],$acc8
469         srl     $t3,24,$acc12
470         and     $acc10,255,$acc10
471         ldub    [$rounds+$acc9],$acc9
472         srl     $t0,16,$acc13
473         and     $t1,255,$acc11
474         ldub    [$rounds+$acc10],$acc10         !
475         srl     $t1,8,$acc14
476         and     $acc13,255,$acc13
477         ldub    [$rounds+$acc11],$acc11
478         ldub    [$rounds+$acc12],$acc12
479         and     $acc14,255,$acc14
480         ldub    [$rounds+$acc13],$acc13
481         and     $t2,255,$acc15
482         ldub    [$rounds+$acc14],$acc14         !
483
484                 sll     $acc0,24,$acc0
485                 xor     $acc3,$s0,$s0
486         ldub    [$rounds+$acc15],$acc15
487                 sll     $acc1,16,$acc1
488                 xor     $acc0,$s0,$s0
489         ldx     [%sp+$bias+$frame+0],%i7        ! restore return address
490         fmovs   %f0,%f0
491                 sll     $acc2,8,$acc2           !
492                 xor     $acc1,$s0,$s0
493                 sll     $acc4,24,$acc4
494                 xor     $acc2,$s0,$s0
495                 sll     $acc5,16,$acc5
496                 xor     $acc7,$s1,$s1
497                 sll     $acc6,8,$acc6
498                 xor     $acc4,$s1,$s1
499                 sll     $acc8,24,$acc8          !
500                 xor     $acc5,$s1,$s1
501                 sll     $acc9,16,$acc9
502                 xor     $acc11,$s2,$s2
503                 sll     $acc10,8,$acc10
504                 xor     $acc6,$s1,$s1
505                 sll     $acc12,24,$acc12
506                 xor     $acc8,$s2,$s2
507                 sll     $acc13,16,$acc13        !
508                 xor     $acc9,$s2,$s2
509                 sll     $acc14,8,$acc14
510                 xor     $acc10,$s2,$s2
511                 xor     $acc12,$acc14,$acc14
512                 xor     $acc13,$s3,$s3
513                 xor     $acc14,$s3,$s3
514                 xor     $acc15,$s3,$s3
515
516         ret
517         restore
518 .type   _sparcv9_AES_encrypt,#function
519 .size   _sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt)
520
521 .align  32
522 .globl  AES_encrypt
523 AES_encrypt:
524         or      %o0,%o1,%g1
525         andcc   %g1,3,%g0
526         bnz,pn  %xcc,.Lunaligned_enc
527         save    %sp,-$frame,%sp
528
529         ld      [%i0+0],%o0
530         ld      [%i0+4],%o1
531         ld      [%i0+8],%o2
532         ld      [%i0+12],%o3
533
534 1:      call    .+8
535         add     %o7,AES_Te-1b,%o4
536         call    _sparcv9_AES_encrypt
537         mov     %i2,%o5
538
539         st      %o0,[%i1+0]
540         st      %o1,[%i1+4]
541         st      %o2,[%i1+8]
542         st      %o3,[%i1+12]
543
544         ret
545         restore
546
547 .align  32
548 .Lunaligned_enc:
549         ldub    [%i0+0],%l0
550         ldub    [%i0+1],%l1
551         ldub    [%i0+2],%l2
552
553         sll     %l0,24,%l0
554         ldub    [%i0+3],%l3
555         sll     %l1,16,%l1
556         ldub    [%i0+4],%l4
557         sll     %l2,8,%l2
558         or      %l1,%l0,%l0
559         ldub    [%i0+5],%l5
560         sll     %l4,24,%l4
561         or      %l3,%l2,%l2
562         ldub    [%i0+6],%l6
563         sll     %l5,16,%l5
564         or      %l0,%l2,%o0
565         ldub    [%i0+7],%l7
566
567         sll     %l6,8,%l6
568         or      %l5,%l4,%l4
569         ldub    [%i0+8],%l0
570         or      %l7,%l6,%l6
571         ldub    [%i0+9],%l1
572         or      %l4,%l6,%o1
573         ldub    [%i0+10],%l2
574
575         sll     %l0,24,%l0
576         ldub    [%i0+11],%l3
577         sll     %l1,16,%l1
578         ldub    [%i0+12],%l4
579         sll     %l2,8,%l2
580         or      %l1,%l0,%l0
581         ldub    [%i0+13],%l5
582         sll     %l4,24,%l4
583         or      %l3,%l2,%l2
584         ldub    [%i0+14],%l6
585         sll     %l5,16,%l5
586         or      %l0,%l2,%o2
587         ldub    [%i0+15],%l7
588
589         sll     %l6,8,%l6
590         or      %l5,%l4,%l4
591         or      %l7,%l6,%l6
592         or      %l4,%l6,%o3
593
594 1:      call    .+8
595         add     %o7,AES_Te-1b,%o4
596         call    _sparcv9_AES_encrypt
597         mov     %i2,%o5
598
599         srl     %o0,24,%l0
600         srl     %o0,16,%l1
601         stb     %l0,[%i1+0]
602         srl     %o0,8,%l2
603         stb     %l1,[%i1+1]
604         stb     %l2,[%i1+2]
605         srl     %o1,24,%l4
606         stb     %o0,[%i1+3]
607
608         srl     %o1,16,%l5
609         stb     %l4,[%i1+4]
610         srl     %o1,8,%l6
611         stb     %l5,[%i1+5]
612         stb     %l6,[%i1+6]
613         srl     %o2,24,%l0
614         stb     %o1,[%i1+7]
615
616         srl     %o2,16,%l1
617         stb     %l0,[%i1+8]
618         srl     %o2,8,%l2
619         stb     %l1,[%i1+9]
620         stb     %l2,[%i1+10]
621         srl     %o3,24,%l4
622         stb     %o2,[%i1+11]
623
624         srl     %o3,16,%l5
625         stb     %l4,[%i1+12]
626         srl     %o3,8,%l6
627         stb     %l5,[%i1+13]
628         stb     %l6,[%i1+14]
629         stb     %o3,[%i1+15]
630
631         ret
632         restore
633 .type   AES_encrypt,#function
634 .size   AES_encrypt,(.-AES_encrypt)
635
636 ___
637
638 $code.=<<___;
639 .align  256
640 AES_Td:
641 ___
642 &_data_word(
643         0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
644         0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
645         0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
646         0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
647         0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
648         0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
649         0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
650         0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
651         0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
652         0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
653         0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
654         0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
655         0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
656         0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
657         0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
658         0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
659         0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
660         0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
661         0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
662         0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
663         0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
664         0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
665         0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
666         0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
667         0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
668         0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
669         0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
670         0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
671         0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
672         0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
673         0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
674         0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
675         0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
676         0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
677         0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
678         0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
679         0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
680         0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
681         0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
682         0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
683         0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
684         0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
685         0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
686         0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
687         0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
688         0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
689         0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
690         0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
691         0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
692         0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
693         0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
694         0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
695         0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
696         0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
697         0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
698         0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
699         0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
700         0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
701         0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
702         0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
703         0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
704         0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
705         0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
706         0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
707 $code.=<<___;
708         .byte   0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
709         .byte   0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
710         .byte   0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
711         .byte   0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
712         .byte   0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
713         .byte   0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
714         .byte   0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
715         .byte   0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
716         .byte   0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
717         .byte   0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
718         .byte   0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
719         .byte   0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
720         .byte   0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
721         .byte   0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
722         .byte   0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
723         .byte   0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
724         .byte   0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
725         .byte   0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
726         .byte   0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
727         .byte   0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
728         .byte   0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
729         .byte   0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
730         .byte   0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
731         .byte   0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
732         .byte   0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
733         .byte   0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
734         .byte   0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
735         .byte   0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
736         .byte   0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
737         .byte   0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
738         .byte   0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
739         .byte   0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
740 .type   AES_Td,#object
741 .size   AES_Td,(.-AES_Td)
742
! _sparcv9_AES_decrypt: internal core that decrypts one 16-byte block.
!
! No .globl directive is given for this symbol here; the AES_decrypt entry
! point further down reaches it with a call instruction.  Operands are
! named through Perl variables (the dollar-prefixed names) that are
! assigned earlier in the script:
!   s0-s3       four 32-bit state words; they hold the input block on
!               entry and the decrypted block on return
!   t0-t3       second set of four words; alternately round-key words
!               and the intermediate state
!   key         key schedule pointer; the 32-bit word at offset 240 is
!               read and halved to give the loop count
!   tbl         lookup table base: 8-byte entries addressed by byte*8
!               in the first 2048 bytes, followed at offset 2048 by a
!               byte table (called td4 in the comments below)
!   acc0-acc15  scratch for the sixteen table look-ups of one round
!   rounds      loop counter, reused as the td4 pointer in the last round
! NOTE(review): the register assignment of these variables is outside this
! excerpt.  AES_decrypt passes the block in %o0-%o3, the table in %o4 and
! the key in %o5, so they presumably map onto %i0-%i5 after the save
! below - confirm against the variable definitions.
743 .align  64
744 .skip   16
745 _sparcv9_AES_decrypt:
746         save    %sp,-$frame-$locals,%sp
747         stx     %i7,[%sp+$bias+$frame+0]        ! off-load return address
            ! Fetch the round count and key words 0-3, xor those words into
            ! the state, and preload t0-t3 with key words 4-7 (offsets
            ! 16-28) for the first round.  The round count is halved because
            ! every pass of the loop below performs two rounds.  The last
            ! instructions already start on the first two table indices.
748         ld      [$key+240],$rounds
749         ld      [$key+0],$t0
750         ld      [$key+4],$t1                    !
751         ld      [$key+8],$t2
752         ld      [$key+12],$t3
753         srl     $rounds,1,$rounds
754         xor     $t0,$s0,$s0
755         ld      [$key+16],$t0
756         xor     $t1,$s1,$s1
757         ld      [$key+20],$t1
758         srl     $s0,21,$acc0                    !
759         xor     $t2,$s2,$s2
760         ld      [$key+24],$t2
761         xor     $t3,$s3,$s3
762         and     $acc0,2040,$acc0
763         ld      [$key+28],$t3
764         srl     $s3,13,$acc1
765         nop
766 .Ldec_loop:
            ! First half of the pass: sixteen 8-byte loads from tbl, indexed
            ! by the bytes of s0-s3.  Each index is byte*8: the shift moves
            ! the wanted byte to bits 10..3 and the mask 2040 (0x7f8) keeps
            ! only those bits.  Byte selection per output word:
            !   acc0-acc3   : s0 bits 31..24, s3 23..16, s2 15..8, s1 7..0
            !   acc4-acc7   : s1 bits 31..24, s0 23..16, s3 15..8, s2 7..0
            !   acc8-acc11  : s2 bits 31..24, s1 23..16, s0 15..8, s3 7..0
            !   acc12-acc15 : s3 bits 31..24, s2 23..16, s1 15..8, s0 7..0
767         srl     $s2,5,$acc2                     !
768         and     $acc1,2040,$acc1
769         ldx     [$tbl+$acc0],$acc0
770         sll     $s1,3,$acc3
771         and     $acc2,2040,$acc2
772         ldx     [$tbl+$acc1],$acc1
773         srl     $s1,21,$acc4
774         and     $acc3,2040,$acc3
775         ldx     [$tbl+$acc2],$acc2              !
776         srl     $s0,13,$acc5
777         and     $acc4,2040,$acc4
778         ldx     [$tbl+$acc3],$acc3
779         srl     $s3,5,$acc6
780         and     $acc5,2040,$acc5
781         ldx     [$tbl+$acc4],$acc4
782         fmovs   %f0,%f0
783         sll     $s2,3,$acc7                     !
784         and     $acc6,2040,$acc6
785         ldx     [$tbl+$acc5],$acc5
786         srl     $s2,21,$acc8
787         and     $acc7,2040,$acc7
788         ldx     [$tbl+$acc6],$acc6
789         srl     $s1,13,$acc9
790         and     $acc8,2040,$acc8
791         ldx     [$tbl+$acc7],$acc7              !
792         srl     $s0,5,$acc10
793         and     $acc9,2040,$acc9
794         ldx     [$tbl+$acc8],$acc8
795         sll     $s3,3,$acc11
796         and     $acc10,2040,$acc10
797         ldx     [$tbl+$acc9],$acc9
798         fmovs   %f0,%f0
799         srl     $s3,21,$acc12                   !
800         and     $acc11,2040,$acc11
801         ldx     [$tbl+$acc10],$acc10
802         srl     $s2,13,$acc13
803         and     $acc12,2040,$acc12
804         ldx     [$tbl+$acc11],$acc11
805         srl     $s1,5,$acc14
806         and     $acc13,2040,$acc13
807         ldx     [$tbl+$acc12],$acc12            !
808         sll     $s0,3,$acc15
809         and     $acc14,2040,$acc14
810         ldx     [$tbl+$acc13],$acc13
811         and     $acc15,2040,$acc15
            ! Advance the key pointer by two round keys (32 bytes).
812         add     $key,32,$key
813         ldx     [$tbl+$acc14],$acc14
814         fmovs   %f0,%f0
            ! Count the pass down and leave for .Ldec_last when the counter
            ! reaches zero.  The branch is annulling, so the add in its
            ! delay slot runs only when the branch is taken: only on that
            ! path does rounds become tbl+2048, the address of td4.
815         subcc   $rounds,1,$rounds               !
816         ldx     [$tbl+$acc15],$acc15
817         bz,a,pn %icc,.Ldec_last
818         add     $tbl,2048,$rounds
819
            ! Fold the look-ups into t0-t3, which hold round-key words: the
            ! first entry of each group of four is used as loaded and the
            ! other three are shifted right by 8, 16 and 24 bits.  At the
            ! same time reload s0-s3 with the four key words at the advanced
            ! key pointer; they are the round key for the second half.
            ! NOTE(review): the shifts presumably turn the 8-byte entry into
            ! byte rotations of one table word - the table layout is outside
            ! this excerpt, confirm there.
820                 srlx    $acc1,8,$acc1
821                 xor     $acc0,$t0,$t0
822         ld      [$key+0],$s0
823         fmovs   %f0,%f0
824                 srlx    $acc2,16,$acc2          !
825                 xor     $acc1,$t0,$t0
826         ld      [$key+4],$s1
827                 srlx    $acc3,24,$acc3
828                 xor     $acc2,$t0,$t0
829         ld      [$key+8],$s2
830                 srlx    $acc5,8,$acc5
831                 xor     $acc3,$t0,$t0
832         ld      [$key+12],$s3                   !
833                 srlx    $acc6,16,$acc6
834                 xor     $acc4,$t1,$t1
835         fmovs   %f0,%f0
836                 srlx    $acc7,24,$acc7
837                 xor     $acc5,$t1,$t1
838                 srlx    $acc9,8,$acc9
839                 xor     $acc6,$t1,$t1
840                 srlx    $acc10,16,$acc10        !
841                 xor     $acc7,$t1,$t1
842                 srlx    $acc11,24,$acc11
843                 xor     $acc8,$t2,$t2
844                 srlx    $acc13,8,$acc13
845                 xor     $acc9,$t2,$t2
846                 srlx    $acc14,16,$acc14
847                 xor     $acc10,$t2,$t2
848                 srlx    $acc15,24,$acc15        !
849                 xor     $acc11,$t2,$t2
850                 xor     $acc12,$acc14,$acc14
851                 xor     $acc13,$t3,$t3
852         srl     $t0,21,$acc0
853                 xor     $acc14,$t3,$t3
854                 xor     $acc15,$t3,$t3
855         srl     $t3,13,$acc1
856
            ! Second half of the pass: the same sixteen look-ups with the
            ! same byte selection, now indexed by t0-t3.
857         and     $acc0,2040,$acc0                !
858         srl     $t2,5,$acc2
859         and     $acc1,2040,$acc1
860         ldx     [$tbl+$acc0],$acc0
861         sll     $t1,3,$acc3
862         and     $acc2,2040,$acc2
863         ldx     [$tbl+$acc1],$acc1
864         fmovs   %f0,%f0
865         srl     $t1,21,$acc4                    !
866         and     $acc3,2040,$acc3
867         ldx     [$tbl+$acc2],$acc2
868         srl     $t0,13,$acc5
869         and     $acc4,2040,$acc4
870         ldx     [$tbl+$acc3],$acc3
871         srl     $t3,5,$acc6
872         and     $acc5,2040,$acc5
873         ldx     [$tbl+$acc4],$acc4              !
874         sll     $t2,3,$acc7
875         and     $acc6,2040,$acc6
876         ldx     [$tbl+$acc5],$acc5
877         srl     $t2,21,$acc8
878         and     $acc7,2040,$acc7
879         ldx     [$tbl+$acc6],$acc6
880         fmovs   %f0,%f0
881         srl     $t1,13,$acc9                    !
882         and     $acc8,2040,$acc8
883         ldx     [$tbl+$acc7],$acc7
884         srl     $t0,5,$acc10
885         and     $acc9,2040,$acc9
886         ldx     [$tbl+$acc8],$acc8
887         sll     $t3,3,$acc11
888         and     $acc10,2040,$acc10
889         ldx     [$tbl+$acc9],$acc9              !
890         srl     $t3,21,$acc12
891         and     $acc11,2040,$acc11
892         ldx     [$tbl+$acc10],$acc10
893         srl     $t2,13,$acc13
894         and     $acc12,2040,$acc12
895         ldx     [$tbl+$acc11],$acc11
896         fmovs   %f0,%f0
897         srl     $t1,5,$acc14                    !
898         and     $acc13,2040,$acc13
899         ldx     [$tbl+$acc12],$acc12
900         sll     $t0,3,$acc15
901         and     $acc14,2040,$acc14
902         ldx     [$tbl+$acc13],$acc13
903                 srlx    $acc1,8,$acc1
904         and     $acc15,2040,$acc15
905         ldx     [$tbl+$acc14],$acc14            !
906
            ! Fold the second set of look-ups into s0-s3 (shifts as above)
            ! and preload t0-t3 with the key words at offsets 16-28 for the
            ! next pass.  The loads into %g0 discard their data; they only
            ! touch the 256 bytes of td4 at 32-byte intervals.
907                 srlx    $acc2,16,$acc2
908                 xor     $acc0,$s0,$s0
909         ldx     [$tbl+$acc15],$acc15
910                 srlx    $acc3,24,$acc3
911                 xor     $acc1,$s0,$s0
912         ld      [$key+16],$t0
913         fmovs   %f0,%f0
914                 srlx    $acc5,8,$acc5           !
915                 xor     $acc2,$s0,$s0
916         ld      [$key+20],$t1
917                 srlx    $acc6,16,$acc6
918                 xor     $acc3,$s0,$s0
919         ld      [$key+24],$t2
920                 srlx    $acc7,24,$acc7
921                 xor     $acc4,$s1,$s1
922         ld      [$key+28],$t3                   !
923                 srlx    $acc9,8,$acc9
924                 xor     $acc5,$s1,$s1
925         ldx     [$tbl+2048+0],%g0               ! prefetch td4
926                 srlx    $acc10,16,$acc10
927                 xor     $acc6,$s1,$s1
928         ldx     [$tbl+2048+32],%g0              ! prefetch td4
929                 srlx    $acc11,24,$acc11
930                 xor     $acc7,$s1,$s1
931         ldx     [$tbl+2048+64],%g0              ! prefetch td4
932                 srlx    $acc13,8,$acc13
933                 xor     $acc8,$s2,$s2
934         ldx     [$tbl+2048+96],%g0              ! prefetch td4
935                 srlx    $acc14,16,$acc14        !
936                 xor     $acc9,$s2,$s2
937         ldx     [$tbl+2048+128],%g0             ! prefetch td4
938                 srlx    $acc15,24,$acc15
939                 xor     $acc10,$s2,$s2
940         ldx     [$tbl+2048+160],%g0             ! prefetch td4
941         srl     $s0,21,$acc0
942                 xor     $acc11,$s2,$s2
943         ldx     [$tbl+2048+192],%g0             ! prefetch td4
944                 xor     $acc12,$acc14,$acc14
945                 xor     $acc13,$s3,$s3
946         ldx     [$tbl+2048+224],%g0             ! prefetch td4
947         and     $acc0,2040,$acc0                !
948                 xor     $acc14,$s3,$s3
949                 xor     $acc15,$s3,$s3
            ! Next pass.  acc0 is already complete; the delay slot starts
            ! acc1, which is why the loop body opens with the acc2 shift.
950         ba      .Ldec_loop
951         srl     $s3,13,$acc1
952
953 .align  32
954 .Ldec_last:
            ! Reached from the annulling branch with rounds = tbl+2048.
            ! Finish the pending round into t0-t3 exactly as in the loop and
            ! load the final four key words into s0-s3.
955                 srlx    $acc1,8,$acc1           !
956                 xor     $acc0,$t0,$t0
957         ld      [$key+0],$s0
958                 srlx    $acc2,16,$acc2
959                 xor     $acc1,$t0,$t0
960         ld      [$key+4],$s1
961                 srlx    $acc3,24,$acc3
962                 xor     $acc2,$t0,$t0
963         ld      [$key+8],$s2                    !
964                 srlx    $acc5,8,$acc5
965                 xor     $acc3,$t0,$t0
966         ld      [$key+12],$s3
967                 srlx    $acc6,16,$acc6
968                 xor     $acc4,$t1,$t1
969                 srlx    $acc7,24,$acc7
970                 xor     $acc5,$t1,$t1
971                 srlx    $acc9,8,$acc9           !
972                 xor     $acc6,$t1,$t1
973                 srlx    $acc10,16,$acc10
974                 xor     $acc7,$t1,$t1
975                 srlx    $acc11,24,$acc11
976                 xor     $acc8,$t2,$t2
977                 srlx    $acc13,8,$acc13
978                 xor     $acc9,$t2,$t2
979                 srlx    $acc14,16,$acc14        !
980                 xor     $acc10,$t2,$t2
981                 srlx    $acc15,24,$acc15
982                 xor     $acc11,$t2,$t2
983                 xor     $acc12,$acc14,$acc14
984                 xor     $acc13,$t3,$t3
985         srl     $t0,24,$acc0
986                 xor     $acc14,$t3,$t3
987                 xor     $acc15,$t3,$t3          !
988         srl     $t3,16,$acc1
989
            ! Last round: sixteen single-byte loads from the table at rounds
            ! (td4), indexed by the plain byte values of t0-t3 with the same
            ! byte selection as in the loop.  The shift by 24 needs no mask
            ! because only eight bits remain afterwards.
990         srl     $t2,8,$acc2
991         and     $acc1,255,$acc1
992         ldub    [$rounds+$acc0],$acc0
993         srl     $t1,24,$acc4
994         and     $acc2,255,$acc2
995         ldub    [$rounds+$acc1],$acc1
996         srl     $t0,16,$acc5                    !
997         and     $t1,255,$acc3
998         ldub    [$rounds+$acc2],$acc2
999         ldub    [$rounds+$acc3],$acc3
1000         srl     $t3,8,$acc6
1001         and     $acc5,255,$acc5
1002         ldub    [$rounds+$acc4],$acc4
1003         fmovs   %f0,%f0
1004         srl     $t2,24,$acc8                    !
1005         and     $acc6,255,$acc6
1006         ldub    [$rounds+$acc5],$acc5
1007         srl     $t1,16,$acc9
1008         and     $t2,255,$acc7
1009         ldub    [$rounds+$acc6],$acc6
1010         ldub    [$rounds+$acc7],$acc7
1011         fmovs   %f0,%f0
1012         srl     $t0,8,$acc10                    !
1013         and     $acc9,255,$acc9
1014         ldub    [$rounds+$acc8],$acc8
1015         srl     $t3,24,$acc12
1016         and     $acc10,255,$acc10
1017         ldub    [$rounds+$acc9],$acc9
1018         srl     $t2,16,$acc13
1019         and     $t3,255,$acc11
1020         ldub    [$rounds+$acc10],$acc10         !
1021         srl     $t1,8,$acc14
1022         and     $acc13,255,$acc13
1023         ldub    [$rounds+$acc11],$acc11
1024         ldub    [$rounds+$acc12],$acc12
1025         and     $acc14,255,$acc14
1026         ldub    [$rounds+$acc13],$acc13
1027         and     $t0,255,$acc15
1028         ldub    [$rounds+$acc14],$acc14         !
1029
            ! Rebuild each output word from its four looked-up bytes, moved
            ! to bit positions 24, 16, 8 and 0, and xor it with the final
            ! key word already held in s0-s3.  The return address saved in
            ! the prologue is reloaded into %i7 along the way.
1030                 sll     $acc0,24,$acc0
1031                 xor     $acc3,$s0,$s0
1032         ldub    [$rounds+$acc15],$acc15
1033                 sll     $acc1,16,$acc1
1034                 xor     $acc0,$s0,$s0
1035         ldx     [%sp+$bias+$frame+0],%i7        ! restore return address
1036         fmovs   %f0,%f0
1037                 sll     $acc2,8,$acc2           !
1038                 xor     $acc1,$s0,$s0
1039                 sll     $acc4,24,$acc4
1040                 xor     $acc2,$s0,$s0
1041                 sll     $acc5,16,$acc5
1042                 xor     $acc7,$s1,$s1
1043                 sll     $acc6,8,$acc6
1044                 xor     $acc4,$s1,$s1
1045                 sll     $acc8,24,$acc8          !
1046                 xor     $acc5,$s1,$s1
1047                 sll     $acc9,16,$acc9
1048                 xor     $acc11,$s2,$s2
1049                 sll     $acc10,8,$acc10
1050                 xor     $acc6,$s1,$s1
1051                 sll     $acc12,24,$acc12
1052                 xor     $acc8,$s2,$s2
1053                 sll     $acc13,16,$acc13        !
1054                 xor     $acc9,$s2,$s2
1055                 sll     $acc14,8,$acc14
1056                 xor     $acc10,$s2,$s2
1057                 xor     $acc12,$acc14,$acc14
1058                 xor     $acc13,$s3,$s3
1059                 xor     $acc14,$s3,$s3
1060                 xor     $acc15,$s3,$s3
1061
            ! Return; the restore in the delay slot releases the register
            ! window and stack frame set up by the save at entry.
1062         ret
1063         restore
1064 .type   _sparcv9_AES_decrypt,#function
1065 .size   _sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt)
1066
! AES_decrypt: exported entry point that decrypts one 16-byte block.
!
! As used by the code below:
!   %o0  address of the 16-byte input block
!   %o1  address of the 16-byte output block
!   %o2  address of the key schedule, handed to the core unchanged
! NOTE(review): this corresponds to a C prototype of the form
! AES_decrypt(in, out, key) - confirm against the public header.
!
! The block is read as four 32-bit words, passed to _sparcv9_AES_decrypt in
! %o0-%o3 together with the address of AES_Td in %o4 and the key in %o5,
! and the four result words are written to the output block.  When either
! block address is not a multiple of four, the byte-wise path at
! .Lunaligned_dec is taken instead of word loads and stores.
1067 .align  32
1068 .globl  AES_decrypt
1069 AES_decrypt:
            ! Test the low two bits of both pointers at once.  The save in
            ! the delay slot is not annulled, so it runs on both paths and
            ! the arguments are in %i0-%i2 from here on.
1070         or      %o0,%o1,%g1
1071         andcc   %g1,3,%g0
1072         bnz,pn  %xcc,.Lunaligned_dec
1073         save    %sp,-$frame,%sp
1074
            ! Aligned path: load the input block as four words.
1075         ld      [%i0+0],%o0
1076         ld      [%i0+4],%o1
1077         ld      [%i0+8],%o2
1078         ld      [%i0+12],%o3
1079
            ! Position-independent address of AES_Td: the call to the next
            ! instruction leaves the address of label 1 in %o7, and the
            ! delay slot adds the assemble-time distance from 1 to AES_Td.
1080 1:      call    .+8
1081         add     %o7,AES_Td-1b,%o4
            ! Run the core; the key pointer is set up in the delay slot.
1082         call    _sparcv9_AES_decrypt
1083         mov     %i2,%o5
1084
            ! Store the four result words to the output block.
1085         st      %o0,[%i1+0]
1086         st      %o1,[%i1+4]
1087         st      %o2,[%i1+8]
1088         st      %o3,[%i1+12]
1089
1090         ret
1091         restore
1092
1093 .align  32
1094 .Lunaligned_dec:
            ! Unaligned path: build each 32-bit word from four single-byte
            ! loads, first byte in bits 31..24 and last byte in bits 7..0.
            ! Word 0 ends up in %o0.
1095         ldub    [%i0+0],%l0
1096         ldub    [%i0+1],%l1
1097         ldub    [%i0+2],%l2
1098
1099         sll     %l0,24,%l0
1100         ldub    [%i0+3],%l3
1101         sll     %l1,16,%l1
1102         ldub    [%i0+4],%l4
1103         sll     %l2,8,%l2
1104         or      %l1,%l0,%l0
1105         ldub    [%i0+5],%l5
1106         sll     %l4,24,%l4
1107         or      %l3,%l2,%l2
1108         ldub    [%i0+6],%l6
1109         sll     %l5,16,%l5
1110         or      %l0,%l2,%o0
1111         ldub    [%i0+7],%l7
1112
            ! Word 1 (bytes 4-7) into %o1.
1113         sll     %l6,8,%l6
1114         or      %l5,%l4,%l4
1115         ldub    [%i0+8],%l0
1116         or      %l7,%l6,%l6
1117         ldub    [%i0+9],%l1
1118         or      %l4,%l6,%o1
1119         ldub    [%i0+10],%l2
1120
            ! Word 2 (bytes 8-11) into %o2.
1121         sll     %l0,24,%l0
1122         ldub    [%i0+11],%l3
1123         sll     %l1,16,%l1
1124         ldub    [%i0+12],%l4
1125         sll     %l2,8,%l2
1126         or      %l1,%l0,%l0
1127         ldub    [%i0+13],%l5
1128         sll     %l4,24,%l4
1129         or      %l3,%l2,%l2
1130         ldub    [%i0+14],%l6
1131         sll     %l5,16,%l5
1132         or      %l0,%l2,%o2
1133         ldub    [%i0+15],%l7
1134
            ! Word 3 (bytes 12-15) into %o3.
1135         sll     %l6,8,%l6
1136         or      %l5,%l4,%l4
1137         or      %l7,%l6,%l6
1138         or      %l4,%l6,%o3
1139
            ! Same table-address computation and core call as on the
            ! aligned path.
1140 1:      call    .+8
1141         add     %o7,AES_Td-1b,%o4
1142         call    _sparcv9_AES_decrypt
1143         mov     %i2,%o5
1144
            ! Write the result one byte at a time, bits 31..24 of each word
            ! first.  A byte store keeps only the low eight bits, so the
            ! last byte of each word is stored straight from %o0-%o3.
1145         srl     %o0,24,%l0
1146         srl     %o0,16,%l1
1147         stb     %l0,[%i1+0]
1148         srl     %o0,8,%l2
1149         stb     %l1,[%i1+1]
1150         stb     %l2,[%i1+2]
1151         srl     %o1,24,%l4
1152         stb     %o0,[%i1+3]
1153
1154         srl     %o1,16,%l5
1155         stb     %l4,[%i1+4]
1156         srl     %o1,8,%l6
1157         stb     %l5,[%i1+5]
1158         stb     %l6,[%i1+6]
1159         srl     %o2,24,%l0
1160         stb     %o1,[%i1+7]
1161
1162         srl     %o2,16,%l1
1163         stb     %l0,[%i1+8]
1164         srl     %o2,8,%l2
1165         stb     %l1,[%i1+9]
1166         stb     %l2,[%i1+10]
1167         srl     %o3,24,%l4
1168         stb     %o2,[%i1+11]
1169
1170         srl     %o3,16,%l5
1171         stb     %l4,[%i1+12]
1172         srl     %o3,8,%l6
1173         stb     %l5,[%i1+13]
1174         stb     %l6,[%i1+14]
1175         stb     %o3,[%i1+15]
1176
1177         ret
1178         restore
1179 .type   AES_decrypt,#function
1180 .size   AES_decrypt,(.-AES_decrypt)
1181 ___
1182
1183 # fmovs instructions substituting for FP nops were originally added
1184 # to meet specific instruction alignment requirements to maximize ILP.
1185 # As UltraSPARC T1, a.k.a. Niagara, has a shared FPU, FP nops can have
1186 # an undesired effect, so just omit them and sacrifice a fraction of a
1187 # percent in performance...
# Remove every fmovs filler from the accumulated assembly text: each match
# runs from the mnemonic to the end of its line (/m), and all of them are
# removed (/g).
1188 $code =~ s/fmovs.*$//gm;
1189
# Write the assembly to STDOUT.  The explicit, checked close forces the
# buffered output to be flushed, so a write failure makes the script die
# instead of going unnoticed; $! adds the system error text to the message.
1190 print $code;
1191 close STDOUT or die "error closing STDOUT: $!";     # ensure flush