Split bignum code out of the sparcv9cap.c
[openssl.git] / crypto / aes / asm / aes-sparcv9.pl
1 #! /usr/bin/env perl
2 # Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. Rights for redistribution and usage in source and binary
13 # forms are granted according to the License.
14 # ====================================================================
15 #
16 # Version 1.1
17 #
18 # The major reason for undertaking this effort was to mitigate the hazard
19 # of cache-timing attacks. This is [currently and initially!] addressed in
20 # two ways. 1. S-boxes are compressed from 5KB to 2KB+256B in size each.
21 # 2. References to them are scheduled for L2 cache latency, meaning
22 # that the tables don't have to reside in L1 cache. Once again, this
23 # is an initial draft and one should expect more countermeasures to
24 # be implemented...
25 #
26 # Version 1.1 prefetches T[ed]4 in order to mitigate attacks on the last
27 # round.
28 #
29 # Even though performance was not the primary goal [on the contrary,
30 # extra shifts "induced" by the compressed S-box and a longer loop epilogue
31 # "induced" by scheduling for L2 have a negative effect on performance],
32 # the code turned out to run in ~23 cycles per processed byte en-/
33 # decrypted with a 128-bit key. This is a pretty good result for code
34 # with the mentioned qualities on an UltraSPARC core. Compared to Sun C
35 # generated code my encrypt procedure runs just a few percent faster,
36 # while the decrypt one is a whole 50% faster [yes, Sun C failed to generate
37 # an optimal decrypt procedure]. Compared to GNU C generated code both
38 # procedures are more than 60% faster:-)
39
# The last command-line argument, if there is one, names the output file;
# STDOUT is then redirected to it.  The three-argument form of open keeps
# characters in the file name from being interpreted as part of the mode,
# and a failed open now aborts instead of silently leaving STDOUT unchanged.
# With no (or a false) argument STDOUT is left alone, as before.
if ($output = pop) {
    open STDOUT,">",$output or die "can't open $output: $!";
}
41
# Stack frame parameters.  $frame and $bias are symbolic names that are
# pasted into the assembly as-is (presumably macros supplied by
# crypto/sparc_arch.h -- TODO confirm); $locals is the number of extra
# bytes the inner routines reserve in their "save" for the off-loaded
# return address.
($frame,$bias,$locals)=("STACK_FRAME","STACK_BIAS",16);

# Sixteen registers receiving the table look-ups of one round, grouped
# four per output word.
($acc0, $acc1, $acc2, $acc3 )=("%l0","%o0","%o1","%o2");
($acc4, $acc5, $acc6, $acc7 )=("%l1","%o3","%o4","%o5");
($acc8, $acc9, $acc10,$acc11)=("%l2","%o7","%g1","%g2");
($acc12,$acc13,$acc14,$acc15)=("%l3","%g3","%g4","%g5");

# Temporaries: loaded with round-key words and then used to accumulate
# the alternate copy of the state.
($t0,$t1,$t2,$t3)=map { "%l$_" } 4..7;

# The four 32-bit state words.
($s0,$s1,$s2,$s3)=map { "%i$_" } 0..3;

$tbl   ="%i4";  # base address of the look-up table
$key   ="%i5";  # pointer into the key schedule
$rounds="%i7";  # aliases with return address, which is off-loaded to stack
78
# Append table entries to the global $code.
#
# Arguments: a list of integers.  For each one a line of the form
#     "\t.long\t0xXXXXXXXX,0xXXXXXXXX\n"
# is appended, i.e. every 32-bit value is written out twice so that one
# table entry occupies eight bytes (the round code fetches entries with
# 64-bit ldx loads at 8-byte strides).
#
# No prototype is declared: the sub takes a list, and the former empty
# prototype "()" only worked because callers use the "&" call form, which
# bypasses prototype checking.  Both call forms are accepted now.
sub _data_word
{
    foreach my $word (@_) {
        $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$word,$word;
    }
    return;
}
83
# Assembler preamble, appended verbatim to $code: defines __ASSEMBLER__ if
# it is not defined yet, includes crypto/sparc_arch.h, declares %g2 and %g3
# as scratch registers when __arch64__ is defined, switches to the .text
# section and opens the 256-byte aligned AES_Te label.  The table contents
# are emitted by the statements that follow.
84 $code.=<<___;
85 #ifndef __ASSEMBLER__
86 # define __ASSEMBLER__ 1
87 #endif
88 #include "crypto/sparc_arch.h"
89
90 #ifdef  __arch64__
91 .register       %g2,#scratch
92 .register       %g3,#scratch
93 #endif
94 .section        ".text",#alloc,#execinstr
95
96 .align  256
97 AES_Te:
98 ___
# AES_Te word table: 256 32-bit constants.  _data_word writes each one out
# twice, so an entry occupies 8 bytes and the whole table 2048 bytes; the
# round code indexes it with offsets masked by 2040.
99 &_data_word(
100         0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
101         0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
102         0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
103         0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
104         0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
105         0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
106         0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
107         0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
108         0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
109         0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
110         0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
111         0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
112         0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
113         0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
114         0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
115         0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
116         0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
117         0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
118         0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
119         0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
120         0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
121         0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
122         0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
123         0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
124         0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
125         0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
126         0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
127         0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
128         0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
129         0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
130         0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
131         0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
132         0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
133         0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
134         0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
135         0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
136         0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
137         0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
138         0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
139         0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
140         0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
141         0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
142         0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
143         0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
144         0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
145         0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
146         0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
147         0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
148         0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
149         0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
150         0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
151         0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
152         0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
153         0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
154         0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
155         0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
156         0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
157         0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
158         0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
159         0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
160         0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
161         0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
162         0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
163         0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
# Rest of AES_Te plus the encryption code, appended as one literal:
#  - 256 single-byte entries placed directly after the 2048-byte word
#    table; the last round reads them through $tbl+2048 and the main loop
#    touches them in advance ("prefetch te4").
#  - _sparcv9_AES_encrypt, the inner routine.  It expects the state in
#    $s0-$s3, the table base in $tbl and the key schedule in $key, reads
#    the round count from [$key+240] and halves it (the loop body handles
#    two rounds per iteration).  %i7 is saved on the stack because $rounds
#    reuses that register.
#  - AES_encrypt, the global entry point.  It takes the input pointer in
#    %o0, the output pointer in %o1 and the key pointer in %o2.  If either
#    the input or the output pointer is not 4-byte aligned it goes through
#    .Lunaligned_enc, which loads and stores the block byte by byte;
#    otherwise it uses word loads and stores.
164 $code.=<<___;
165         .byte   0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
166         .byte   0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
167         .byte   0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
168         .byte   0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
169         .byte   0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
170         .byte   0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
171         .byte   0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
172         .byte   0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
173         .byte   0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
174         .byte   0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
175         .byte   0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
176         .byte   0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
177         .byte   0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
178         .byte   0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
179         .byte   0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
180         .byte   0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
181         .byte   0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
182         .byte   0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
183         .byte   0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
184         .byte   0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
185         .byte   0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
186         .byte   0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
187         .byte   0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
188         .byte   0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
189         .byte   0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
190         .byte   0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
191         .byte   0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
192         .byte   0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
193         .byte   0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
194         .byte   0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
195         .byte   0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
196         .byte   0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
197 .type   AES_Te,#object
198 .size   AES_Te,(.-AES_Te)
199
200 .align  64
201 .skip   16
202 _sparcv9_AES_encrypt:
203         save    %sp,-$frame-$locals,%sp
204         stx     %i7,[%sp+$bias+$frame+0]        ! off-load return address
205         ld      [$key+240],$rounds
206         ld      [$key+0],$t0
207         ld      [$key+4],$t1                    !
208         ld      [$key+8],$t2
209         srl     $rounds,1,$rounds
210         xor     $t0,$s0,$s0
211         ld      [$key+12],$t3
212         srl     $s0,21,$acc0
213         xor     $t1,$s1,$s1
214         ld      [$key+16],$t0
215         srl     $s1,13,$acc1                    !
216         xor     $t2,$s2,$s2
217         ld      [$key+20],$t1
218         xor     $t3,$s3,$s3
219         ld      [$key+24],$t2
220         and     $acc0,2040,$acc0
221         ld      [$key+28],$t3
222         nop
223 .Lenc_loop:
224         srl     $s2,5,$acc2                     !
225         and     $acc1,2040,$acc1
226         ldx     [$tbl+$acc0],$acc0
227         sll     $s3,3,$acc3
228         and     $acc2,2040,$acc2
229         ldx     [$tbl+$acc1],$acc1
230         srl     $s1,21,$acc4
231         and     $acc3,2040,$acc3
232         ldx     [$tbl+$acc2],$acc2              !
233         srl     $s2,13,$acc5
234         and     $acc4,2040,$acc4
235         ldx     [$tbl+$acc3],$acc3
236         srl     $s3,5,$acc6
237         and     $acc5,2040,$acc5
238         ldx     [$tbl+$acc4],$acc4
239         fmovs   %f0,%f0
240         sll     $s0,3,$acc7                     !
241         and     $acc6,2040,$acc6
242         ldx     [$tbl+$acc5],$acc5
243         srl     $s2,21,$acc8
244         and     $acc7,2040,$acc7
245         ldx     [$tbl+$acc6],$acc6
246         srl     $s3,13,$acc9
247         and     $acc8,2040,$acc8
248         ldx     [$tbl+$acc7],$acc7              !
249         srl     $s0,5,$acc10
250         and     $acc9,2040,$acc9
251         ldx     [$tbl+$acc8],$acc8
252         sll     $s1,3,$acc11
253         and     $acc10,2040,$acc10
254         ldx     [$tbl+$acc9],$acc9
255         fmovs   %f0,%f0
256         srl     $s3,21,$acc12                   !
257         and     $acc11,2040,$acc11
258         ldx     [$tbl+$acc10],$acc10
259         srl     $s0,13,$acc13
260         and     $acc12,2040,$acc12
261         ldx     [$tbl+$acc11],$acc11
262         srl     $s1,5,$acc14
263         and     $acc13,2040,$acc13
264         ldx     [$tbl+$acc12],$acc12            !
265         sll     $s2,3,$acc15
266         and     $acc14,2040,$acc14
267         ldx     [$tbl+$acc13],$acc13
268         and     $acc15,2040,$acc15
269         add     $key,32,$key
270         ldx     [$tbl+$acc14],$acc14
271         fmovs   %f0,%f0
272         subcc   $rounds,1,$rounds               !
273         ldx     [$tbl+$acc15],$acc15
274         bz,a,pn %icc,.Lenc_last
275         add     $tbl,2048,$rounds
276
277                 srlx    $acc1,8,$acc1
278                 xor     $acc0,$t0,$t0
279         ld      [$key+0],$s0
280         fmovs   %f0,%f0
281                 srlx    $acc2,16,$acc2          !
282                 xor     $acc1,$t0,$t0
283         ld      [$key+4],$s1
284                 srlx    $acc3,24,$acc3
285                 xor     $acc2,$t0,$t0
286         ld      [$key+8],$s2
287                 srlx    $acc5,8,$acc5
288                 xor     $acc3,$t0,$t0
289         ld      [$key+12],$s3                   !
290                 srlx    $acc6,16,$acc6
291                 xor     $acc4,$t1,$t1
292         fmovs   %f0,%f0
293                 srlx    $acc7,24,$acc7
294                 xor     $acc5,$t1,$t1
295                 srlx    $acc9,8,$acc9
296                 xor     $acc6,$t1,$t1
297                 srlx    $acc10,16,$acc10        !
298                 xor     $acc7,$t1,$t1
299                 srlx    $acc11,24,$acc11
300                 xor     $acc8,$t2,$t2
301                 srlx    $acc13,8,$acc13
302                 xor     $acc9,$t2,$t2
303                 srlx    $acc14,16,$acc14
304                 xor     $acc10,$t2,$t2
305                 srlx    $acc15,24,$acc15        !
306                 xor     $acc11,$t2,$t2
307                 xor     $acc12,$acc14,$acc14
308                 xor     $acc13,$t3,$t3
309         srl     $t0,21,$acc0
310                 xor     $acc14,$t3,$t3
311         srl     $t1,13,$acc1
312                 xor     $acc15,$t3,$t3
313
314         and     $acc0,2040,$acc0                !
315         srl     $t2,5,$acc2
316         and     $acc1,2040,$acc1
317         ldx     [$tbl+$acc0],$acc0
318         sll     $t3,3,$acc3
319         and     $acc2,2040,$acc2
320         ldx     [$tbl+$acc1],$acc1
321         fmovs   %f0,%f0
322         srl     $t1,21,$acc4                    !
323         and     $acc3,2040,$acc3
324         ldx     [$tbl+$acc2],$acc2
325         srl     $t2,13,$acc5
326         and     $acc4,2040,$acc4
327         ldx     [$tbl+$acc3],$acc3
328         srl     $t3,5,$acc6
329         and     $acc5,2040,$acc5
330         ldx     [$tbl+$acc4],$acc4              !
331         sll     $t0,3,$acc7
332         and     $acc6,2040,$acc6
333         ldx     [$tbl+$acc5],$acc5
334         srl     $t2,21,$acc8
335         and     $acc7,2040,$acc7
336         ldx     [$tbl+$acc6],$acc6
337         fmovs   %f0,%f0
338         srl     $t3,13,$acc9                    !
339         and     $acc8,2040,$acc8
340         ldx     [$tbl+$acc7],$acc7
341         srl     $t0,5,$acc10
342         and     $acc9,2040,$acc9
343         ldx     [$tbl+$acc8],$acc8
344         sll     $t1,3,$acc11
345         and     $acc10,2040,$acc10
346         ldx     [$tbl+$acc9],$acc9              !
347         srl     $t3,21,$acc12
348         and     $acc11,2040,$acc11
349         ldx     [$tbl+$acc10],$acc10
350         srl     $t0,13,$acc13
351         and     $acc12,2040,$acc12
352         ldx     [$tbl+$acc11],$acc11
353         fmovs   %f0,%f0
354         srl     $t1,5,$acc14                    !
355         and     $acc13,2040,$acc13
356         ldx     [$tbl+$acc12],$acc12
357         sll     $t2,3,$acc15
358         and     $acc14,2040,$acc14
359         ldx     [$tbl+$acc13],$acc13
360                 srlx    $acc1,8,$acc1
361         and     $acc15,2040,$acc15
362         ldx     [$tbl+$acc14],$acc14            !
363
364                 srlx    $acc2,16,$acc2
365                 xor     $acc0,$s0,$s0
366         ldx     [$tbl+$acc15],$acc15
367                 srlx    $acc3,24,$acc3
368                 xor     $acc1,$s0,$s0
369         ld      [$key+16],$t0
370         fmovs   %f0,%f0
371                 srlx    $acc5,8,$acc5           !
372                 xor     $acc2,$s0,$s0
373         ld      [$key+20],$t1
374                 srlx    $acc6,16,$acc6
375                 xor     $acc3,$s0,$s0
376         ld      [$key+24],$t2
377                 srlx    $acc7,24,$acc7
378                 xor     $acc4,$s1,$s1
379         ld      [$key+28],$t3                   !
380                 srlx    $acc9,8,$acc9
381                 xor     $acc5,$s1,$s1
382         ldx     [$tbl+2048+0],%g0               ! prefetch te4
383                 srlx    $acc10,16,$acc10
384                 xor     $acc6,$s1,$s1
385         ldx     [$tbl+2048+32],%g0              ! prefetch te4
386                 srlx    $acc11,24,$acc11
387                 xor     $acc7,$s1,$s1
388         ldx     [$tbl+2048+64],%g0              ! prefetch te4
389                 srlx    $acc13,8,$acc13
390                 xor     $acc8,$s2,$s2
391         ldx     [$tbl+2048+96],%g0              ! prefetch te4
392                 srlx    $acc14,16,$acc14        !
393                 xor     $acc9,$s2,$s2
394         ldx     [$tbl+2048+128],%g0             ! prefetch te4
395                 srlx    $acc15,24,$acc15
396                 xor     $acc10,$s2,$s2
397         ldx     [$tbl+2048+160],%g0             ! prefetch te4
398         srl     $s0,21,$acc0
399                 xor     $acc11,$s2,$s2
400         ldx     [$tbl+2048+192],%g0             ! prefetch te4
401                 xor     $acc12,$acc14,$acc14
402                 xor     $acc13,$s3,$s3
403         ldx     [$tbl+2048+224],%g0             ! prefetch te4
404         srl     $s1,13,$acc1                    !
405                 xor     $acc14,$s3,$s3
406                 xor     $acc15,$s3,$s3
407         ba      .Lenc_loop
408         and     $acc0,2040,$acc0
409
410 .align  32
411 .Lenc_last:
412                 srlx    $acc1,8,$acc1           !
413                 xor     $acc0,$t0,$t0
414         ld      [$key+0],$s0
415                 srlx    $acc2,16,$acc2
416                 xor     $acc1,$t0,$t0
417         ld      [$key+4],$s1
418                 srlx    $acc3,24,$acc3
419                 xor     $acc2,$t0,$t0
420         ld      [$key+8],$s2                    !
421                 srlx    $acc5,8,$acc5
422                 xor     $acc3,$t0,$t0
423         ld      [$key+12],$s3
424                 srlx    $acc6,16,$acc6
425                 xor     $acc4,$t1,$t1
426                 srlx    $acc7,24,$acc7
427                 xor     $acc5,$t1,$t1
428                 srlx    $acc9,8,$acc9           !
429                 xor     $acc6,$t1,$t1
430                 srlx    $acc10,16,$acc10
431                 xor     $acc7,$t1,$t1
432                 srlx    $acc11,24,$acc11
433                 xor     $acc8,$t2,$t2
434                 srlx    $acc13,8,$acc13
435                 xor     $acc9,$t2,$t2
436                 srlx    $acc14,16,$acc14        !
437                 xor     $acc10,$t2,$t2
438                 srlx    $acc15,24,$acc15
439                 xor     $acc11,$t2,$t2
440                 xor     $acc12,$acc14,$acc14
441                 xor     $acc13,$t3,$t3
442         srl     $t0,24,$acc0
443                 xor     $acc14,$t3,$t3
444         srl     $t1,16,$acc1                    !
445                 xor     $acc15,$t3,$t3
446
447         srl     $t2,8,$acc2
448         and     $acc1,255,$acc1
449         ldub    [$rounds+$acc0],$acc0
450         srl     $t1,24,$acc4
451         and     $acc2,255,$acc2
452         ldub    [$rounds+$acc1],$acc1
453         srl     $t2,16,$acc5                    !
454         and     $t3,255,$acc3
455         ldub    [$rounds+$acc2],$acc2
456         ldub    [$rounds+$acc3],$acc3
457         srl     $t3,8,$acc6
458         and     $acc5,255,$acc5
459         ldub    [$rounds+$acc4],$acc4
460         fmovs   %f0,%f0
461         srl     $t2,24,$acc8                    !
462         and     $acc6,255,$acc6
463         ldub    [$rounds+$acc5],$acc5
464         srl     $t3,16,$acc9
465         and     $t0,255,$acc7
466         ldub    [$rounds+$acc6],$acc6
467         ldub    [$rounds+$acc7],$acc7
468         fmovs   %f0,%f0
469         srl     $t0,8,$acc10                    !
470         and     $acc9,255,$acc9
471         ldub    [$rounds+$acc8],$acc8
472         srl     $t3,24,$acc12
473         and     $acc10,255,$acc10
474         ldub    [$rounds+$acc9],$acc9
475         srl     $t0,16,$acc13
476         and     $t1,255,$acc11
477         ldub    [$rounds+$acc10],$acc10         !
478         srl     $t1,8,$acc14
479         and     $acc13,255,$acc13
480         ldub    [$rounds+$acc11],$acc11
481         ldub    [$rounds+$acc12],$acc12
482         and     $acc14,255,$acc14
483         ldub    [$rounds+$acc13],$acc13
484         and     $t2,255,$acc15
485         ldub    [$rounds+$acc14],$acc14         !
486
487                 sll     $acc0,24,$acc0
488                 xor     $acc3,$s0,$s0
489         ldub    [$rounds+$acc15],$acc15
490                 sll     $acc1,16,$acc1
491                 xor     $acc0,$s0,$s0
492         ldx     [%sp+$bias+$frame+0],%i7        ! restore return address
493         fmovs   %f0,%f0
494                 sll     $acc2,8,$acc2           !
495                 xor     $acc1,$s0,$s0
496                 sll     $acc4,24,$acc4
497                 xor     $acc2,$s0,$s0
498                 sll     $acc5,16,$acc5
499                 xor     $acc7,$s1,$s1
500                 sll     $acc6,8,$acc6
501                 xor     $acc4,$s1,$s1
502                 sll     $acc8,24,$acc8          !
503                 xor     $acc5,$s1,$s1
504                 sll     $acc9,16,$acc9
505                 xor     $acc11,$s2,$s2
506                 sll     $acc10,8,$acc10
507                 xor     $acc6,$s1,$s1
508                 sll     $acc12,24,$acc12
509                 xor     $acc8,$s2,$s2
510                 sll     $acc13,16,$acc13        !
511                 xor     $acc9,$s2,$s2
512                 sll     $acc14,8,$acc14
513                 xor     $acc10,$s2,$s2
514                 xor     $acc12,$acc14,$acc14
515                 xor     $acc13,$s3,$s3
516                 xor     $acc14,$s3,$s3
517                 xor     $acc15,$s3,$s3
518
519         ret
520         restore
521 .type   _sparcv9_AES_encrypt,#function
522 .size   _sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt)
523
524 .align  32
525 .globl  AES_encrypt
526 AES_encrypt:
527         or      %o0,%o1,%g1
528         andcc   %g1,3,%g0
529         bnz,pn  %xcc,.Lunaligned_enc
530         save    %sp,-$frame,%sp
531
532         ld      [%i0+0],%o0
533         ld      [%i0+4],%o1
534         ld      [%i0+8],%o2
535         ld      [%i0+12],%o3
536
537 1:      call    .+8
538         add     %o7,AES_Te-1b,%o4
539         call    _sparcv9_AES_encrypt
540         mov     %i2,%o5
541
542         st      %o0,[%i1+0]
543         st      %o1,[%i1+4]
544         st      %o2,[%i1+8]
545         st      %o3,[%i1+12]
546
547         ret
548         restore
549
550 .align  32
551 .Lunaligned_enc:
552         ldub    [%i0+0],%l0
553         ldub    [%i0+1],%l1
554         ldub    [%i0+2],%l2
555
556         sll     %l0,24,%l0
557         ldub    [%i0+3],%l3
558         sll     %l1,16,%l1
559         ldub    [%i0+4],%l4
560         sll     %l2,8,%l2
561         or      %l1,%l0,%l0
562         ldub    [%i0+5],%l5
563         sll     %l4,24,%l4
564         or      %l3,%l2,%l2
565         ldub    [%i0+6],%l6
566         sll     %l5,16,%l5
567         or      %l0,%l2,%o0
568         ldub    [%i0+7],%l7
569
570         sll     %l6,8,%l6
571         or      %l5,%l4,%l4
572         ldub    [%i0+8],%l0
573         or      %l7,%l6,%l6
574         ldub    [%i0+9],%l1
575         or      %l4,%l6,%o1
576         ldub    [%i0+10],%l2
577
578         sll     %l0,24,%l0
579         ldub    [%i0+11],%l3
580         sll     %l1,16,%l1
581         ldub    [%i0+12],%l4
582         sll     %l2,8,%l2
583         or      %l1,%l0,%l0
584         ldub    [%i0+13],%l5
585         sll     %l4,24,%l4
586         or      %l3,%l2,%l2
587         ldub    [%i0+14],%l6
588         sll     %l5,16,%l5
589         or      %l0,%l2,%o2
590         ldub    [%i0+15],%l7
591
592         sll     %l6,8,%l6
593         or      %l5,%l4,%l4
594         or      %l7,%l6,%l6
595         or      %l4,%l6,%o3
596
597 1:      call    .+8
598         add     %o7,AES_Te-1b,%o4
599         call    _sparcv9_AES_encrypt
600         mov     %i2,%o5
601
602         srl     %o0,24,%l0
603         srl     %o0,16,%l1
604         stb     %l0,[%i1+0]
605         srl     %o0,8,%l2
606         stb     %l1,[%i1+1]
607         stb     %l2,[%i1+2]
608         srl     %o1,24,%l4
609         stb     %o0,[%i1+3]
610
611         srl     %o1,16,%l5
612         stb     %l4,[%i1+4]
613         srl     %o1,8,%l6
614         stb     %l5,[%i1+5]
615         stb     %l6,[%i1+6]
616         srl     %o2,24,%l0
617         stb     %o1,[%i1+7]
618
619         srl     %o2,16,%l1
620         stb     %l0,[%i1+8]
621         srl     %o2,8,%l2
622         stb     %l1,[%i1+9]
623         stb     %l2,[%i1+10]
624         srl     %o3,24,%l4
625         stb     %o2,[%i1+11]
626
627         srl     %o3,16,%l5
628         stb     %l4,[%i1+12]
629         srl     %o3,8,%l6
630         stb     %l5,[%i1+13]
631         stb     %l6,[%i1+14]
632         stb     %o3,[%i1+15]
633
634         ret
635         restore
636 .type   AES_encrypt,#function
637 .size   AES_encrypt,(.-AES_encrypt)
638
639 ___
640
# Opens the decryption table: a 256-byte aligned AES_Td label.  The table
# contents are emitted by the statements that follow.
641 $code.=<<___;
642 .align  256
643 AES_Td:
644 ___
# AES_Td word table: 256 32-bit constants.  _data_word writes each one out
# twice, so an entry occupies 8 bytes and the whole table 2048 bytes.
645 &_data_word(
646         0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
647         0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
648         0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
649         0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
650         0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
651         0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
652         0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
653         0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
654         0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
655         0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
656         0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
657         0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
658         0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
659         0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
660         0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
661         0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
662         0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
663         0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
664         0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
665         0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
666         0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
667         0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
668         0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
669         0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
670         0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
671         0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
672         0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
673         0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
674         0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
675         0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
676         0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
677         0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
678         0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
679         0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
680         0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
681         0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
682         0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
683         0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
684         0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
685         0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
686         0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
687         0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
688         0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
689         0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
690         0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
691         0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
692         0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
693         0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
694         0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
695         0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
696         0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
697         0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
698         0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
699         0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
700         0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
701         0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
702         0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
703         0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
704         0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
705         0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
706         0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
707         0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
708         0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
709         0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
710 $code.=<<___;
711         .byte   0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
712         .byte   0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
713         .byte   0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
714         .byte   0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
715         .byte   0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
716         .byte   0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
717         .byte   0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
718         .byte   0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
719         .byte   0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
720         .byte   0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
721         .byte   0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
722         .byte   0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
723         .byte   0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
724         .byte   0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
725         .byte   0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
726         .byte   0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
727         .byte   0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
728         .byte   0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
729         .byte   0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
730         .byte   0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
731         .byte   0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
732         .byte   0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
733         .byte   0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
734         .byte   0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
735         .byte   0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
736         .byte   0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
737         .byte   0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
738         .byte   0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
739         .byte   0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
740         .byte   0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
741         .byte   0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
742         .byte   0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
743 .type   AES_Td,#object
744 .size   AES_Td,(.-AES_Td)
745
746 .align  64
747 .skip   16
    ! _sparcv9_AES_decrypt: decryption core used by both paths of AES_decrypt.
    ! In:   $s0..$s3  the four 32-bit words of the input block
    !       $tbl      base of the lookup table; AES_decrypt hands over the
    !                 address of AES_Td for it
    !       $key      key schedule; the word at offset 240, halved, is the
    !                 loop count
    ! Out:  $s0..$s3  the four words of the result
    ! $t0..$t3 and $acc0..$acc15 are scratch. The mapping of these symbolic
    ! names onto machine registers is set up earlier in this file.
748 _sparcv9_AES_decrypt:
            ! Open a register window with a frame enlarged by $locals bytes.
            ! %i7 is parked on the stack here and reloaded just before ret.
            ! NOTE(review): presumably %i7 doubles as one of the scratch
            ! registers - confirm against the register map.
749         save    %sp,-$frame-$locals,%sp
750         stx     %i7,[%sp+$bias+$frame+0]        ! off-load return address
            ! Fold the first four key words into the state and preload key
            ! words 4..7 into $t0..$t3. Every loop pass below consumes 32 bytes
            ! of key and performs two table rounds, hence the halved count.
751         ld      [$key+240],$rounds
752         ld      [$key+0],$t0
753         ld      [$key+4],$t1                    !
754         ld      [$key+8],$t2
755         ld      [$key+12],$t3
756         srl     $rounds,1,$rounds
757         xor     $t0,$s0,$s0
758         ld      [$key+16],$t0
759         xor     $t1,$s1,$s1
760         ld      [$key+20],$t1
761         srl     $s0,21,$acc0                    !
762         xor     $t2,$s2,$s2
763         ld      [$key+24],$t2
764         xor     $t3,$s3,$s3
765         and     $acc0,2040,$acc0
766         ld      [$key+28],$t3
767         srl     $s3,13,$acc1
768         nop
769 .Ldec_loop:
            ! First half: from $s0..$s3 into $t0..$t3. Every state byte becomes
            ! a table offset: srl 21/13/5 or sll 3 followed by and 2040 (255*8)
            ! isolates byte 3/2/1/0 of a word, already scaled by 8, and ldx
            ! fetches the 8-byte entry at that offset. $t0 is built from byte 3
            ! of $s0, byte 2 of $s3, byte 1 of $s2 and byte 0 of $s1; $t1..$t3
            ! advance every source word by one (inverse ShiftRows selection).
            ! The first two index computations were already issued ahead of the
            ! label and are issued again at the bottom of the loop.
770         srl     $s2,5,$acc2                     !
771         and     $acc1,2040,$acc1
772         ldx     [$tbl+$acc0],$acc0
773         sll     $s1,3,$acc3
774         and     $acc2,2040,$acc2
775         ldx     [$tbl+$acc1],$acc1
776         srl     $s1,21,$acc4
777         and     $acc3,2040,$acc3
778         ldx     [$tbl+$acc2],$acc2              !
779         srl     $s0,13,$acc5
780         and     $acc4,2040,$acc4
781         ldx     [$tbl+$acc3],$acc3
782         srl     $s3,5,$acc6
783         and     $acc5,2040,$acc5
784         ldx     [$tbl+$acc4],$acc4
785         fmovs   %f0,%f0
786         sll     $s2,3,$acc7                     !
787         and     $acc6,2040,$acc6
788         ldx     [$tbl+$acc5],$acc5
789         srl     $s2,21,$acc8
790         and     $acc7,2040,$acc7
791         ldx     [$tbl+$acc6],$acc6
792         srl     $s1,13,$acc9
793         and     $acc8,2040,$acc8
794         ldx     [$tbl+$acc7],$acc7              !
795         srl     $s0,5,$acc10
796         and     $acc9,2040,$acc9
797         ldx     [$tbl+$acc8],$acc8
798         sll     $s3,3,$acc11
799         and     $acc10,2040,$acc10
800         ldx     [$tbl+$acc9],$acc9
801         fmovs   %f0,%f0
802         srl     $s3,21,$acc12                   !
803         and     $acc11,2040,$acc11
804         ldx     [$tbl+$acc10],$acc10
805         srl     $s2,13,$acc13
806         and     $acc12,2040,$acc12
807         ldx     [$tbl+$acc11],$acc11
808         srl     $s1,5,$acc14
809         and     $acc13,2040,$acc13
810         ldx     [$tbl+$acc12],$acc12            !
811         sll     $s0,3,$acc15
812         and     $acc14,2040,$acc14
813         ldx     [$tbl+$acc13],$acc13
814         and     $acc15,2040,$acc15
            ! step to the next pair of round keys
815         add     $key,32,$key
816         ldx     [$tbl+$acc14],$acc14
817         fmovs   %f0,%f0
            ! Count down. The delay slot of the branch is annulled, so the add
            ! executes only when the branch to .Ldec_last is taken; it then
            ! reuses $rounds as the base of the byte table that starts 2048
            ! bytes into $tbl.
818         subcc   $rounds,1,$rounds               !
819         ldx     [$tbl+$acc15],$acc15
820         bz,a,pn %icc,.Ldec_last
821         add     $tbl,2048,$rounds
822
            ! Combine: within each group of four fetched entries the second,
            ! third and fourth are shifted right by 8, 16 and 24 bits before
            ! all four are xored onto the preloaded key word in $t0..$t3.
            ! NOTE(review): presumably the 8-byte table entries are laid out so
            ! that these shifts expose the byte-rotated table word in the low
            ! 32 bits - the table is defined earlier in this file, confirm there.
            ! Interleaved with that, $s0..$s3 are reloaded with the next four
            ! key words for the second half.
823                 srlx    $acc1,8,$acc1
824                 xor     $acc0,$t0,$t0
825         ld      [$key+0],$s0
826         fmovs   %f0,%f0
827                 srlx    $acc2,16,$acc2          !
828                 xor     $acc1,$t0,$t0
829         ld      [$key+4],$s1
830                 srlx    $acc3,24,$acc3
831                 xor     $acc2,$t0,$t0
832         ld      [$key+8],$s2
833                 srlx    $acc5,8,$acc5
834                 xor     $acc3,$t0,$t0
835         ld      [$key+12],$s3                   !
836                 srlx    $acc6,16,$acc6
837                 xor     $acc4,$t1,$t1
838         fmovs   %f0,%f0
839                 srlx    $acc7,24,$acc7
840                 xor     $acc5,$t1,$t1
841                 srlx    $acc9,8,$acc9
842                 xor     $acc6,$t1,$t1
843                 srlx    $acc10,16,$acc10        !
844                 xor     $acc7,$t1,$t1
845                 srlx    $acc11,24,$acc11
846                 xor     $acc8,$t2,$t2
847                 srlx    $acc13,8,$acc13
848                 xor     $acc9,$t2,$t2
849                 srlx    $acc14,16,$acc14
850                 xor     $acc10,$t2,$t2
851                 srlx    $acc15,24,$acc15        !
852                 xor     $acc11,$t2,$t2
853                 xor     $acc12,$acc14,$acc14
854                 xor     $acc13,$t3,$t3
855         srl     $t0,21,$acc0
856                 xor     $acc14,$t3,$t3
857                 xor     $acc15,$t3,$t3
858         srl     $t3,13,$acc1
859
            ! Second half: the same round again, this time from $t0..$t3 into
            ! $s0..$s3, with identical index arithmetic and byte selection.
860         and     $acc0,2040,$acc0                !
861         srl     $t2,5,$acc2
862         and     $acc1,2040,$acc1
863         ldx     [$tbl+$acc0],$acc0
864         sll     $t1,3,$acc3
865         and     $acc2,2040,$acc2
866         ldx     [$tbl+$acc1],$acc1
867         fmovs   %f0,%f0
868         srl     $t1,21,$acc4                    !
869         and     $acc3,2040,$acc3
870         ldx     [$tbl+$acc2],$acc2
871         srl     $t0,13,$acc5
872         and     $acc4,2040,$acc4
873         ldx     [$tbl+$acc3],$acc3
874         srl     $t3,5,$acc6
875         and     $acc5,2040,$acc5
876         ldx     [$tbl+$acc4],$acc4              !
877         sll     $t2,3,$acc7
878         and     $acc6,2040,$acc6
879         ldx     [$tbl+$acc5],$acc5
880         srl     $t2,21,$acc8
881         and     $acc7,2040,$acc7
882         ldx     [$tbl+$acc6],$acc6
883         fmovs   %f0,%f0
884         srl     $t1,13,$acc9                    !
885         and     $acc8,2040,$acc8
886         ldx     [$tbl+$acc7],$acc7
887         srl     $t0,5,$acc10
888         and     $acc9,2040,$acc9
889         ldx     [$tbl+$acc8],$acc8
890         sll     $t3,3,$acc11
891         and     $acc10,2040,$acc10
892         ldx     [$tbl+$acc9],$acc9              !
893         srl     $t3,21,$acc12
894         and     $acc11,2040,$acc11
895         ldx     [$tbl+$acc10],$acc10
896         srl     $t2,13,$acc13
897         and     $acc12,2040,$acc12
898         ldx     [$tbl+$acc11],$acc11
899         fmovs   %f0,%f0
900         srl     $t1,5,$acc14                    !
901         and     $acc13,2040,$acc13
902         ldx     [$tbl+$acc12],$acc12
903         sll     $t0,3,$acc15
904         and     $acc14,2040,$acc14
905         ldx     [$tbl+$acc13],$acc13
906                 srlx    $acc1,8,$acc1
907         and     $acc15,2040,$acc15
908         ldx     [$tbl+$acc14],$acc14            !
909
            ! Combine into $s0..$s3 (which hold the key words loaded above) and
            ! preload $t0..$t3 with the four key words the next pass starts from.
910                 srlx    $acc2,16,$acc2
911                 xor     $acc0,$s0,$s0
912         ldx     [$tbl+$acc15],$acc15
913                 srlx    $acc3,24,$acc3
914                 xor     $acc1,$s0,$s0
915         ld      [$key+16],$t0
916         fmovs   %f0,%f0
917                 srlx    $acc5,8,$acc5           !
918                 xor     $acc2,$s0,$s0
919         ld      [$key+20],$t1
920                 srlx    $acc6,16,$acc6
921                 xor     $acc3,$s0,$s0
922         ld      [$key+24],$t2
923                 srlx    $acc7,24,$acc7
924                 xor     $acc4,$s1,$s1
925         ld      [$key+28],$t3                   !
926                 srlx    $acc9,8,$acc9
927                 xor     $acc5,$s1,$s1
            ! The loads into %g0 below discard their data; eight of them, 32
            ! bytes apart, touch the 256 bytes at $tbl+2048 that the last round
            ! reads through $rounds.
928         ldx     [$tbl+2048+0],%g0               ! prefetch td4
929                 srlx    $acc10,16,$acc10
930                 xor     $acc6,$s1,$s1
931         ldx     [$tbl+2048+32],%g0              ! prefetch td4
932                 srlx    $acc11,24,$acc11
933                 xor     $acc7,$s1,$s1
934         ldx     [$tbl+2048+64],%g0              ! prefetch td4
935                 srlx    $acc13,8,$acc13
936                 xor     $acc8,$s2,$s2
937         ldx     [$tbl+2048+96],%g0              ! prefetch td4
938                 srlx    $acc14,16,$acc14        !
939                 xor     $acc9,$s2,$s2
940         ldx     [$tbl+2048+128],%g0             ! prefetch td4
941                 srlx    $acc15,24,$acc15
942                 xor     $acc10,$s2,$s2
943         ldx     [$tbl+2048+160],%g0             ! prefetch td4
944         srl     $s0,21,$acc0
945                 xor     $acc11,$s2,$s2
946         ldx     [$tbl+2048+192],%g0             ! prefetch td4
947                 xor     $acc12,$acc14,$acc14
948                 xor     $acc13,$s3,$s3
949         ldx     [$tbl+2048+224],%g0             ! prefetch td4
950         and     $acc0,2040,$acc0                !
951                 xor     $acc14,$s3,$s3
952                 xor     $acc15,$s3,$s3
            ! The delay slot of ba recomputes the second hoisted index for the
            ! next pass (the first one was recomputed a few lines above).
953         ba      .Ldec_loop
954         srl     $s3,13,$acc1
955
956 .align  32
957 .Ldec_last:
            ! Last pass: finish the pending table round into $t0..$t3 exactly as
            ! in the loop and load the final four key words into $s0..$s3.
958                 srlx    $acc1,8,$acc1           !
959                 xor     $acc0,$t0,$t0
960         ld      [$key+0],$s0
961                 srlx    $acc2,16,$acc2
962                 xor     $acc1,$t0,$t0
963         ld      [$key+4],$s1
964                 srlx    $acc3,24,$acc3
965                 xor     $acc2,$t0,$t0
966         ld      [$key+8],$s2                    !
967                 srlx    $acc5,8,$acc5
968                 xor     $acc3,$t0,$t0
969         ld      [$key+12],$s3
970                 srlx    $acc6,16,$acc6
971                 xor     $acc4,$t1,$t1
972                 srlx    $acc7,24,$acc7
973                 xor     $acc5,$t1,$t1
974                 srlx    $acc9,8,$acc9           !
975                 xor     $acc6,$t1,$t1
976                 srlx    $acc10,16,$acc10
977                 xor     $acc7,$t1,$t1
978                 srlx    $acc11,24,$acc11
979                 xor     $acc8,$t2,$t2
980                 srlx    $acc13,8,$acc13
981                 xor     $acc9,$t2,$t2
982                 srlx    $acc14,16,$acc14        !
983                 xor     $acc10,$t2,$t2
984                 srlx    $acc15,24,$acc15
985                 xor     $acc11,$t2,$t2
986                 xor     $acc12,$acc14,$acc14
987                 xor     $acc13,$t3,$t3
988         srl     $t0,24,$acc0
989                 xor     $acc14,$t3,$t3
990                 xor     $acc15,$t3,$t3          !
991         srl     $t3,16,$acc1
992
            ! Final round: plain byte lookups (ldub) in the 256-byte table that
            ! $rounds now points to. The byte selection follows the same
            ! pattern as in the loop. An index taken with srl by 24 is already
            ! below 256; all other indices are masked with 255.
993         srl     $t2,8,$acc2
994         and     $acc1,255,$acc1
995         ldub    [$rounds+$acc0],$acc0
996         srl     $t1,24,$acc4
997         and     $acc2,255,$acc2
998         ldub    [$rounds+$acc1],$acc1
999         srl     $t0,16,$acc5                    !
1000         and     $t1,255,$acc3
1001         ldub    [$rounds+$acc2],$acc2
1002         ldub    [$rounds+$acc3],$acc3
1003         srl     $t3,8,$acc6
1004         and     $acc5,255,$acc5
1005         ldub    [$rounds+$acc4],$acc4
1006         fmovs   %f0,%f0
1007         srl     $t2,24,$acc8                    !
1008         and     $acc6,255,$acc6
1009         ldub    [$rounds+$acc5],$acc5
1010         srl     $t1,16,$acc9
1011         and     $t2,255,$acc7
1012         ldub    [$rounds+$acc6],$acc6
1013         ldub    [$rounds+$acc7],$acc7
1014         fmovs   %f0,%f0
1015         srl     $t0,8,$acc10                    !
1016         and     $acc9,255,$acc9
1017         ldub    [$rounds+$acc8],$acc8
1018         srl     $t3,24,$acc12
1019         and     $acc10,255,$acc10
1020         ldub    [$rounds+$acc9],$acc9
1021         srl     $t2,16,$acc13
1022         and     $t3,255,$acc11
1023         ldub    [$rounds+$acc10],$acc10         !
1024         srl     $t1,8,$acc14
1025         and     $acc13,255,$acc13
1026         ldub    [$rounds+$acc11],$acc11
1027         ldub    [$rounds+$acc12],$acc12
1028         and     $acc14,255,$acc14
1029         ldub    [$rounds+$acc13],$acc13
1030         and     $t0,255,$acc15
1031         ldub    [$rounds+$acc14],$acc14         !
1032
             ! Reassemble: the looked-up bytes are moved back to bit positions
             ! 24, 16, 8 and 0 and xored onto the key words held in $s0..$s3,
             ! which leaves the result there.
1033                 sll     $acc0,24,$acc0
1034                 xor     $acc3,$s0,$s0
1035         ldub    [$rounds+$acc15],$acc15
1036                 sll     $acc1,16,$acc1
1037                 xor     $acc0,$s0,$s0
1038         ldx     [%sp+$bias+$frame+0],%i7        ! restore return address
1039         fmovs   %f0,%f0
1040                 sll     $acc2,8,$acc2           !
1041                 xor     $acc1,$s0,$s0
1042                 sll     $acc4,24,$acc4
1043                 xor     $acc2,$s0,$s0
1044                 sll     $acc5,16,$acc5
1045                 xor     $acc7,$s1,$s1
1046                 sll     $acc6,8,$acc6
1047                 xor     $acc4,$s1,$s1
1048                 sll     $acc8,24,$acc8          !
1049                 xor     $acc5,$s1,$s1
1050                 sll     $acc9,16,$acc9
1051                 xor     $acc11,$s2,$s2
1052                 sll     $acc10,8,$acc10
1053                 xor     $acc6,$s1,$s1
1054                 sll     $acc12,24,$acc12
1055                 xor     $acc8,$s2,$s2
1056                 sll     $acc13,16,$acc13        !
1057                 xor     $acc9,$s2,$s2
1058                 sll     $acc14,8,$acc14
1059                 xor     $acc10,$s2,$s2
1060                 xor     $acc12,$acc14,$acc14
1061                 xor     $acc13,$s3,$s3
1062                 xor     $acc14,$s3,$s3
1063                 xor     $acc15,$s3,$s3
1064
             ! return through the reloaded %i7; restore sits in the delay slot
1065         ret
1066         restore
1067 .type   _sparcv9_AES_decrypt,#function
1068 .size   _sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt)
1069
1070 .align  32
     ! AES_decrypt: decrypt the 16 bytes at %o0 into the 16 bytes at %o1 using
     ! the key pointer in %o2 (seen as %i0, %i1 and %i2 once the save below
     ! has run). All cipher work is done by _sparcv9_AES_decrypt; this wrapper
     ! only moves the block between memory and %o0..%o3 and supplies the
     ! table address (%o4) and the key pointer (%o5).
1071 .globl  AES_decrypt
1072 AES_decrypt:
             ! Word loads and stores are used only when both pointers are
             ! 4-byte aligned; otherwise the byte-wise path is taken. The save
             ! in the delay slot is not annulled, so it runs on both paths.
1073         or      %o0,%o1,%g1
1074         andcc   %g1,3,%g0
1075         bnz,pn  %xcc,.Lunaligned_dec
1076         save    %sp,-$frame,%sp
1077
             ! aligned path: fetch the block as four words
1078         ld      [%i0+0],%o0
1079         ld      [%i0+4],%o1
1080         ld      [%i0+8],%o2
1081         ld      [%i0+12],%o3
1082
             ! Position-independent address of AES_Td: call .+8 only deposits
             ! its own address (label 1) in %o7 and falls through, and the
             ! add in its delay slot applies the assembly-time distance from
             ! label 1 to AES_Td. The key pointer is placed in %o5 in the
             ! delay slot of the real call; the result returns in %o0..%o3.
1083 1:      call    .+8
1084         add     %o7,AES_Td-1b,%o4
1085         call    _sparcv9_AES_decrypt
1086         mov     %i2,%o5
1087
             ! store the four result words
1088         st      %o0,[%i1+0]
1089         st      %o1,[%i1+4]
1090         st      %o2,[%i1+8]
1091         st      %o3,[%i1+12]
1092
1093         ret
1094         restore
1095
1096 .align  32
1097 .Lunaligned_dec:
             ! Byte-wise path: every word is assembled from four ldub loads,
             ! the byte at the lowest address going to bits 31..24, which gives
             ! the same value an aligned ld would have produced. %l0..%l7 are
             ! used as temporaries, the words end up in %o0..%o3.
1098         ldub    [%i0+0],%l0
1099         ldub    [%i0+1],%l1
1100         ldub    [%i0+2],%l2
1101
1102         sll     %l0,24,%l0
1103         ldub    [%i0+3],%l3
1104         sll     %l1,16,%l1
1105         ldub    [%i0+4],%l4
1106         sll     %l2,8,%l2
1107         or      %l1,%l0,%l0
1108         ldub    [%i0+5],%l5
1109         sll     %l4,24,%l4
1110         or      %l3,%l2,%l2
1111         ldub    [%i0+6],%l6
1112         sll     %l5,16,%l5
1113         or      %l0,%l2,%o0
1114         ldub    [%i0+7],%l7
1115
1116         sll     %l6,8,%l6
1117         or      %l5,%l4,%l4
1118         ldub    [%i0+8],%l0
1119         or      %l7,%l6,%l6
1120         ldub    [%i0+9],%l1
1121         or      %l4,%l6,%o1
1122         ldub    [%i0+10],%l2
1123
1124         sll     %l0,24,%l0
1125         ldub    [%i0+11],%l3
1126         sll     %l1,16,%l1
1127         ldub    [%i0+12],%l4
1128         sll     %l2,8,%l2
1129         or      %l1,%l0,%l0
1130         ldub    [%i0+13],%l5
1131         sll     %l4,24,%l4
1132         or      %l3,%l2,%l2
1133         ldub    [%i0+14],%l6
1134         sll     %l5,16,%l5
1135         or      %l0,%l2,%o2
1136         ldub    [%i0+15],%l7
1137
1138         sll     %l6,8,%l6
1139         or      %l5,%l4,%l4
1140         or      %l7,%l6,%l6
1141         or      %l4,%l6,%o3
1142
             ! same table-address idiom and core call as on the aligned path;
             ! label 1 is a local numeric label, 1b means the nearest one above
1143 1:      call    .+8
1144         add     %o7,AES_Td-1b,%o4
1145         call    _sparcv9_AES_decrypt
1146         mov     %i2,%o5
1147
             ! Scatter the four result words back as single bytes, most
             ! significant byte first; stb writes the low 8 bits of its
             ! source, so the last byte of each word needs no shift.
1148         srl     %o0,24,%l0
1149         srl     %o0,16,%l1
1150         stb     %l0,[%i1+0]
1151         srl     %o0,8,%l2
1152         stb     %l1,[%i1+1]
1153         stb     %l2,[%i1+2]
1154         srl     %o1,24,%l4
1155         stb     %o0,[%i1+3]
1156
1157         srl     %o1,16,%l5
1158         stb     %l4,[%i1+4]
1159         srl     %o1,8,%l6
1160         stb     %l5,[%i1+5]
1161         stb     %l6,[%i1+6]
1162         srl     %o2,24,%l0
1163         stb     %o1,[%i1+7]
1164
1165         srl     %o2,16,%l1
1166         stb     %l0,[%i1+8]
1167         srl     %o2,8,%l2
1168         stb     %l1,[%i1+9]
1169         stb     %l2,[%i1+10]
1170         srl     %o3,24,%l4
1171         stb     %o2,[%i1+11]
1172
1173         srl     %o3,16,%l5
1174         stb     %l4,[%i1+12]
1175         srl     %o3,8,%l6
1176         stb     %l5,[%i1+13]
1177         stb     %l6,[%i1+14]
1178         stb     %o3,[%i1+15]
1179
1180         ret
1181         restore
1182 .type   AES_decrypt,#function
1183 .size   AES_decrypt,(.-AES_decrypt)
1184 ___
1185
1186 # The fmovs instructions in the code above stand in for FP nops. They
1187 # were originally added only to meet specific instruction alignment
1188 # requirements and so maximize ILP. UltraSPARC T1, a.k.a. Niagara, has
1189 # a shared FPU, where FP nops can have an undesired effect, so drop
1190 # them all and sacrifice a fraction of a percent in performance...
1191 $code =~ s/fmovs.*$//gm;
1192
1193 print($code);
1194 die "error closing STDOUT: $!" unless close(STDOUT); # ensure flush