# ====================================================================
# Written by David S. Miller <davem@davemloft.net> and Andy Polyakov
# <appro@openssl.org>. The module is licensed under 2-clause BSD
# license. October 2012. All rights reserved.
# ====================================================================
######################################################################
# AES round instructions complete in 3 cycles and can be issued every
# cycle. It means that round calculations should take 4*rounds cycles,
# because any given round instruction depends on result of *both*
# previous instructions:
# Provided that fxor [with IV] takes 3 cycles to complete, critical
# path length for CBC encrypt would be 3+4*rounds, or in other words
# it should process one byte in at least (3+4*rounds)/16 cycles. This
# estimate doesn't account for "collateral" instructions, such as
# fetching input from memory, xor-ing it with zero-round key and
# storing the result. Yet, *measured* performance [for data aligned
# at 64-bit boundary!] deviates from this equation by less than 0.5%:
#		128-bit key	192-	256-
# CBC encrypt	2.70/2.90(*)	3.20/3.40	3.70/3.90
#			(*)	numbers after slash are for misaligned data.
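#
# A back-of-envelope check of the (3+4*rounds)/16 estimate against the
# measured figures above (a sketch, not used by the generator):
#
#	for my $rounds (10, 12, 14) {
#		printf("%.4f\n", (3+4*$rounds)/16);	# 2.6875, 3.1875, 3.6875
#	}
#
# i.e. within 0.5% of the measured 2.70, 3.20 and 3.70 cycles per byte.
#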
# Out-of-order execution logic managed to fully overlap "collateral"
# instructions with those on critical path. Amazing!
# As with Intel AES-NI, the question is whether it's possible to improve
# performance of parallelizable modes by interleaving round
# instructions. Given round instruction latency and throughput, the
# optimal interleave factor is 2. But can we expect 2x performance
# improvement? Well, as round instructions can be issued one per
# cycle, they don't saturate the 2-way issue pipeline and therefore
# there is room for "collateral" calculations... Yet, 2x speed-up
# over CBC encrypt remains unattainable:
#		128-bit key	192-	256-
# CBC decrypt	1.64/2.11	1.89/2.37	2.23/2.61
# CTR		1.64/2.08(*)	1.89/2.33	2.23/2.61
#			(*)	numbers after slash are for misaligned data.
# Instruction-count estimates made under the assumption that round
# instructions are not pairable with any other instruction suggest
# that the latter is indeed the case and the pipeline runs
# underutilized. It should be noted that T4 out-of-order execution
# logic is so capable that performance gain from 2x interleave is
# not even impressive, ~7-13% over non-interleaved code, largest
# for 256-bit keys.
# To anchor to something else, software implementation processes
# one byte in 29 cycles with 128-bit key on same processor. Intel
# Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts
# in 0.93, naturally with AES-NI.
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=112; }
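# Note: 2047 is the stack bias mandated by the 64-bit SPARC V9 ABI,
# where %sp and %fp point 2047 bytes below the actual frame; the
# frame sizes above are chosen accordingly.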
$evp=1;		# if $evp is set to 0, script generates module with
# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
# points. These however are not fully compatible with openssl/aes.h,
# because they expect AES_KEY to be aligned at 64-bit boundary. When
# used through EVP, alignment is arranged at EVP layer. Second thing
# that is arranged by EVP is at least 32-bit alignment of IV.
######################################################################
# single-round subroutines
my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
	andcc		$inp, 7, %g1		! is input aligned?
	ld		[$key + 240], $rounds
	ldd		[$key + 16], %f12
	ldd		[$key + 24], %f14
	srl		$rounds, 1, $rounds
	ldd		[$key + 32], %f16
	sub		$rounds, 1, $rounds
	ldd		[$key + 40], %f18
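!	The loop below covers two AES rounds per iteration, which is why
!	$rounds was halved above; the final two rounds are peeled off
!	after the loop and use the _l (last-round) instruction flavours.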
	aes_eround01	%f12, %f0, %f2, %f4
	aes_eround23	%f14, %f0, %f2, %f2
	sub		$rounds,1,$rounds
	aes_eround01	%f16, %f4, %f2, %f0
	aes_eround23	%f18, %f4, %f2, %f2
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	brnz,pt		$rounds, .Lenc
	andcc		$out, 7, $tmp		! is output aligned?
	aes_eround01	%f12, %f0, %f2, %f4
	aes_eround23	%f14, %f0, %f2, %f2
	aes_eround01_l	%f16, %f4, %f2, %f0
	aes_eround23_l	%f18, %f4, %f2, %f2
2:	alignaddrl	$out, %g0, $out
	srl		$mask, $tmp, $mask
	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $mask]0xc0	! partial store
	orn		%g0, $mask, $mask
	stda		%f8, [$out + $mask]0xc0	! partial store
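!	Unaligned output is handled by rotating the result through
!	faligndata and issuing byte-masked VIS partial stores (ASI 0xc0),
!	with $mask selecting the live bytes at either edge.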
.type	aes_t4_encrypt,#function
.size	aes_t4_encrypt,.-aes_t4_encrypt
.globl	aes_t4_decrypt
	andcc		$inp, 7, %g1		! is input aligned?
	ldx		[$inp + 16], $inp
	ld		[$key + 240], $rounds
	ldd		[$key + 16], %f12
	ldd		[$key + 24], %f14
	srl		$rounds, 1, $rounds
	ldd		[$key + 32], %f16
	sub		$rounds, 1, $rounds
	ldd		[$key + 40], %f18
	aes_dround01	%f12, %f0, %f2, %f4
	aes_dround23	%f14, %f0, %f2, %f2
	sub		$rounds,1,$rounds
	aes_dround01	%f16, %f4, %f2, %f0
	aes_dround23	%f18, %f4, %f2, %f2
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	brnz,pt		$rounds, .Ldec
	andcc		$out, 7, $tmp		! is output aligned?
	aes_dround01	%f12, %f0, %f2, %f4
	aes_dround23	%f14, %f0, %f2, %f2
	aes_dround01_l	%f16, %f4, %f2, %f0
	aes_dround23_l	%f18, %f4, %f2, %f2
2:	alignaddrl	$out, %g0, $out
	srl		$mask, $tmp, $mask
	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $mask]0xc0	! partial store
	orn		%g0, $mask, $mask
	stda		%f8, [$out + $mask]0xc0	! partial store
.type	aes_t4_decrypt,#function
.size	aes_t4_decrypt,.-aes_t4_decrypt
######################################################################
# key setup subroutines
my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
.globl	aes_t4_set_encrypt_key
aes_t4_set_encrypt_key:
	alignaddr	$inp, %g0, $inp
	brz,pt		$tmp, .L256aligned
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f8, %f6
for ($i=0; $i<6; $i++) {
	std		%f0, [$out + `32*$i+0`]
	aes_kexpand1	%f0, %f6, $i, %f0
	std		%f2, [$out + `32*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
	std		%f4, [$out + `32*$i+16`]
	aes_kexpand0	%f4, %f2, %f4
	std		%f6, [$out + `32*$i+24`]
	aes_kexpand2	%f6, %f4, %f6
	std		%f0, [$out + `32*$i+0`]
	aes_kexpand1	%f0, %f6, $i, %f0
	std		%f2, [$out + `32*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
	std		%f4, [$out + `32*$i+16`]
	std		%f6, [$out + `32*$i+24`]
	std		%f0, [$out + `32*$i+32`]
	std		%f2, [$out + `32*$i+40`]
	st		$tmp, [$out + 240]
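!	The round count (14 for 256-bit keys) is stored at byte offset
!	240, just past the 60-word rd_key array of AES_KEY.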
	brz,pt		$tmp, .L192aligned
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
for ($i=0; $i<7; $i++) {
	std		%f0, [$out + `24*$i+0`]
	aes_kexpand1	%f0, %f4, $i, %f0
	std		%f2, [$out + `24*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
	std		%f4, [$out + `24*$i+16`]
	aes_kexpand2	%f4, %f2, %f4
	std		%f0, [$out + `24*$i+0`]
	aes_kexpand1	%f0, %f4, $i, %f0
	std		%f2, [$out + `24*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
	std		%f4, [$out + `24*$i+16`]
	std		%f0, [$out + `24*$i+24`]
	std		%f2, [$out + `24*$i+32`]
	st		$tmp, [$out + 240]
	brz,pt		$tmp, .L128aligned
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
for ($i=0; $i<10; $i++) {
	std		%f0, [$out + `16*$i+0`]
	aes_kexpand1	%f0, %f2, $i, %f0
	std		%f2, [$out + `16*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
	std		%f0, [$out + `16*$i+0`]
	std		%f2, [$out + `16*$i+8`]
	st		$tmp, [$out + 240]
.type	aes_t4_set_encrypt_key,#function
.size	aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key
.globl	aes_t4_set_decrypt_key
aes_t4_set_decrypt_key:
	call		.Lset_encrypt_key
	sll		$tmp, 4, $inp		! $tmp is number of rounds
	add		$out, $inp, $inp	! $inp=$out+16*rounds
	srl		$tmp, 2, $tmp		! $tmp=(rounds+2)/4
	ldd		[$inp - 16], %f12
	std		%f12, [$out + 16]
	std		%f14, [$out + 24]
	brnz		$tmp, .Lkey_flip
.type	aes_t4_set_decrypt_key,#function
.size	aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
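!	The decrypt schedule is the encrypt schedule with the 16-byte
!	round keys in reverse order; .Lkey_flip above swaps them in
!	place from both ends toward the middle.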
my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
for ($i=2; $i<22;$i++) {			# load key schedule
	ldd		[$key + `8*$i`], %f`12+2*$i`
.type	_aes128_loadkey,#function
.size	_aes128_loadkey,.-_aes128_loadkey
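!	The loop above maps round keys 1-10 (offsets 16-168) onto
!	%f16-%f54; round key 0 stays in integer registers and is xor-ed
!	into the input words before the FP rounds ("^= rk[0]" below).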
for ($i=0; $i<4; $i++) {
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
	aes_eround01	%f48, %f0, %f2, %f4
	aes_eround23	%f50, %f0, %f2, %f2
	aes_eround01_l	%f52, %f4, %f2, %f0
	aes_eround23_l	%f54, %f4, %f2, %f2
.type	_aes128_encrypt_1x,#function
.size	_aes128_encrypt_1x,.-_aes128_encrypt_1x
for ($i=0; $i<4; $i++) {
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
	aes_eround01	%f48, %f0, %f2, %f8
	aes_eround23	%f50, %f0, %f2, %f2
	aes_eround01	%f48, %f4, %f6, %f10
	aes_eround23	%f50, %f4, %f6, %f6
	aes_eround01_l	%f52, %f8, %f2, %f0
	aes_eround23_l	%f54, %f8, %f2, %f2
	aes_eround01_l	%f52, %f10, %f6, %f4
	aes_eround23_l	%f54, %f10, %f6, %f6
.type	_aes128_encrypt_2x,#function
.size	_aes128_encrypt_2x,.-_aes128_encrypt_2x
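!	The 2x flavour interleaves two independent blocks (%f0/%f2 and
!	%f4/%f6) so the 3-cycle round-instruction latency of one block
!	is hidden behind the other, as discussed in the header notes.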
for ($i=0; $i<4; $i++) {
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
	aes_dround01	%f48, %f0, %f2, %f4
	aes_dround23	%f50, %f0, %f2, %f2
	aes_dround01_l	%f52, %f4, %f2, %f0
	aes_dround23_l	%f54, %f4, %f2, %f2
.type	_aes128_decrypt_1x,#function
.size	_aes128_decrypt_1x,.-_aes128_decrypt_1x
for ($i=0; $i<4; $i++) {
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
	aes_dround01	%f48, %f0, %f2, %f8
	aes_dround23	%f50, %f0, %f2, %f2
	aes_dround01	%f48, %f4, %f6, %f10
	aes_dround23	%f50, %f4, %f6, %f6
	aes_dround01_l	%f52, %f8, %f2, %f0
	aes_dround23_l	%f54, %f8, %f2, %f2
	aes_dround01_l	%f52, %f10, %f6, %f4
	aes_dround23_l	%f54, %f10, %f6, %f6
.type	_aes128_decrypt_2x,#function
.size	_aes128_decrypt_2x,.-_aes128_decrypt_2x
for ($i=2; $i<26;$i++) {			# load key schedule
	ldd		[$key + `8*$i`], %f`12+2*$i`
.type	_aes192_loadkey,#function
.size	_aes192_loadkey,.-_aes192_loadkey
for ($i=0; $i<5; $i++) {
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
	aes_eround01	%f56, %f0, %f2, %f4
	aes_eround23	%f58, %f0, %f2, %f2
	aes_eround01_l	%f60, %f4, %f2, %f0
	aes_eround23_l	%f62, %f4, %f2, %f2
.type	_aes192_encrypt_1x,#function
.size	_aes192_encrypt_1x,.-_aes192_encrypt_1x
for ($i=0; $i<5; $i++) {
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
	aes_eround01	%f56, %f0, %f2, %f8
	aes_eround23	%f58, %f0, %f2, %f2
	aes_eround01	%f56, %f4, %f6, %f10
	aes_eround23	%f58, %f4, %f6, %f6
	aes_eround01_l	%f60, %f8, %f2, %f0
	aes_eround23_l	%f62, %f8, %f2, %f2
	aes_eround01_l	%f60, %f10, %f6, %f4
	aes_eround23_l	%f62, %f10, %f6, %f6
.type	_aes192_encrypt_2x,#function
.size	_aes192_encrypt_2x,.-_aes192_encrypt_2x
for ($i=0; $i<5; $i++) {
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
	aes_dround01	%f56, %f0, %f2, %f4
	aes_dround23	%f58, %f0, %f2, %f2
	aes_dround01_l	%f60, %f4, %f2, %f0
	aes_dround23_l	%f62, %f4, %f2, %f2
.type	_aes192_decrypt_1x,#function
.size	_aes192_decrypt_1x,.-_aes192_decrypt_1x
for ($i=0; $i<5; $i++) {
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
	aes_dround01	%f56, %f0, %f2, %f8
	aes_dround23	%f58, %f0, %f2, %f2
	aes_dround01	%f56, %f4, %f6, %f10
	aes_dround23	%f58, %f4, %f6, %f6
	aes_dround01_l	%f60, %f8, %f2, %f0
	aes_dround23_l	%f62, %f8, %f2, %f2
	aes_dround01_l	%f60, %f10, %f6, %f4
	aes_dround23_l	%f62, %f10, %f6, %f6
.type	_aes192_decrypt_2x,#function
.size	_aes192_decrypt_2x,.-_aes192_decrypt_2x
	aes_eround01	%f16, %f0, %f2, %f4
	aes_eround23	%f18, %f0, %f2, %f2
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	aes_eround01	%f20, %f4, %f2, %f0
	aes_eround23	%f22, %f4, %f2, %f2
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
for ($i=1; $i<6; $i++) {
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
	aes_eround01	%f16, %f0, %f2, %f4
	aes_eround23	%f18, %f0, %f2, %f2
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	aes_eround01_l	%f20, %f4, %f2, %f0
	aes_eround23_l	%f22, %f4, %f2, %f2
	ldd		[$key + 32], %f20
	ldd		[$key + 40], %f22
.type	_aes256_encrypt_1x,#function
.size	_aes256_encrypt_1x,.-_aes256_encrypt_1x
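!	The 256-bit schedule (15 round keys) exceeds the FP register
!	file, so %f16-%f22 are reloaded mid-stream with the final round
!	keys and restored at the end for the next block.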
	aes_eround01	%f16, %f0, %f2, %f8
	aes_eround23	%f18, %f0, %f2, %f2
	aes_eround01	%f16, %f4, %f6, %f10
	aes_eround23	%f18, %f4, %f6, %f6
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	aes_eround01	%f20, %f8, %f2, %f0
	aes_eround23	%f22, %f8, %f2, %f2
	aes_eround01	%f20, %f10, %f6, %f4
	aes_eround23	%f22, %f10, %f6, %f6
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
for ($i=1; $i<6; $i++) {
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
	aes_eround01	%f16, %f0, %f2, %f8
	aes_eround23	%f18, %f0, %f2, %f2
	aes_eround01	%f16, %f4, %f6, %f10
	aes_eround23	%f18, %f4, %f6, %f6
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	aes_eround01_l	%f20, %f8, %f2, %f0
	aes_eround23_l	%f22, %f8, %f2, %f2
	aes_eround01_l	%f20, %f10, %f6, %f4
	aes_eround23_l	%f22, %f10, %f6, %f6
	ldd		[$key + 32], %f20
	ldd		[$key + 40], %f22
.type	_aes256_encrypt_2x,#function
.size	_aes256_encrypt_2x,.-_aes256_encrypt_2x
	aes_dround01	%f16, %f0, %f2, %f4
	aes_dround23	%f18, %f0, %f2, %f2
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	aes_dround01	%f20, %f4, %f2, %f0
	aes_dround23	%f22, %f4, %f2, %f2
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
for ($i=1; $i<6; $i++) {
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
	aes_dround01	%f16, %f0, %f2, %f4
	aes_dround23	%f18, %f0, %f2, %f2
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	aes_dround01_l	%f20, %f4, %f2, %f0
	aes_dround23_l	%f22, %f4, %f2, %f2
	ldd		[$key + 32], %f20
	ldd		[$key + 40], %f22
.type	_aes256_decrypt_1x,#function
.size	_aes256_decrypt_1x,.-_aes256_decrypt_1x
	aes_dround01	%f16, %f0, %f2, %f8
	aes_dround23	%f18, %f0, %f2, %f2
	aes_dround01	%f16, %f4, %f6, %f10
	aes_dround23	%f18, %f4, %f6, %f6
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	aes_dround01	%f20, %f8, %f2, %f0
	aes_dround23	%f22, %f8, %f2, %f2
	aes_dround01	%f20, %f10, %f6, %f4
	aes_dround23	%f22, %f10, %f6, %f6
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
for ($i=1; $i<6; $i++) {
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
	aes_dround01	%f16, %f0, %f2, %f8
	aes_dround23	%f18, %f0, %f2, %f2
	aes_dround01	%f16, %f4, %f6, %f10
	aes_dround23	%f18, %f4, %f6, %f6
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	aes_dround01_l	%f20, %f8, %f2, %f0
	aes_dround23_l	%f22, %f8, %f2, %f2
	aes_dround01_l	%f20, %f10, %f6, %f4
	aes_dround23_l	%f22, %f10, %f6, %f6
	ldd		[$key + 32], %f20
	ldd		[$key + 40], %f22
.type	_aes256_decrypt_2x,#function
.size	_aes256_decrypt_2x,.-_aes256_decrypt_2x
sub aes_cbc_encrypt_implement {
.globl	aes${bits}_t4_cbc_encrypt
aes${bits}_t4_cbc_encrypt:
	save		%sp, -$frame, %sp
$code.=<<___ if (!$evp);
	andcc		$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec
	ldd		[$ivec + 0], %f0	! load ivec
	ldd		[$ivec + 16], %f4
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
$code.=<<___ if ($evp);
	call		_aes${bits}_loadkey
	sll		$ileft, 3, $ileft
	sub		$iright, $ileft, $iright
	alignaddrl	$out, %g0, $out
	srl		$omask, $ooff, $omask
.L${bits}_cbc_enc_loop:
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %o2
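!	(%o0<<$ileft)|(%o1>>$iright) reassembles aligned 64-bit words
!	from a misaligned input stream; $ileft/$iright derive from the
!	low three bits of $inp.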
	xor		%g4, %o0, %o0		! ^= rk[0]
	fxor		%f12, %f0, %f0		! ^= ivec
	call		_aes${bits}_encrypt_1x
	brnz,pt		$len, .L${bits}_cbc_enc_loop
$code.=<<___ if ($evp);
$code.=<<___ if (!$evp);
	std		%f0, [$ivec + 0]	! write out ivec
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $omask]0xc0	! partial store
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store
	brnz,pt		$len, .L${bits}_cbc_enc_loop+4
	orn		%g0, $omask, $omask
$code.=<<___ if ($evp);
$code.=<<___ if (!$evp);
	std		%f0, [$ivec + 0]	! write out ivec
3:	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	srl		$omask, $ivoff, $omask
	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$ivec + $omask]0xc0
	orn		%g0, $omask, $omask
	stda		%f8, [$ivec + $omask]0xc0
.type	aes${bits}_t4_cbc_encrypt,#function
.size	aes${bits}_t4_cbc_encrypt,.-aes${bits}_t4_cbc_encrypt
&aes_cbc_encrypt_implement(128);
&aes_cbc_encrypt_implement(192);
&aes_cbc_encrypt_implement(256);
sub aes_cbc_decrypt_implement {
.globl	aes${bits}_t4_cbc_decrypt
aes${bits}_t4_cbc_decrypt:
	save		%sp, -$frame, %sp
$code.=<<___ if (!$evp);
	andcc		$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec
	ldd		[$ivec + 0], %f12	! load ivec
	ldd		[$ivec + 8], %f14
	ldd		[$ivec + 16], %f0
	faligndata	%f12, %f14, %f12
	faligndata	%f14, %f0, %f14
$code.=<<___ if ($evp);
	ld		[$ivec + 0], %f12	! load ivec
	ld		[$ivec + 12], %f15
	call		_aes${bits}_loadkey
	andcc		$len, 1, %g0		! is number of blocks even?
	sll		$ileft, 3, $ileft
	sub		$iright, $ileft, $iright
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_cbc_dec_loop2x
	srl		$omask, $ooff, $omask
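!	An odd block count takes one pass through the 1x loop below;
!	the even remainder then runs through the 2x interleaved loop.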
.L${bits}_cbc_dec_loop:
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %o2
	xor		%g4, %o0, %o2		! ^= rk[0]
	call		_aes${bits}_decrypt_1x
	fxor		%f12, %f0, %f0		! ^= ivec
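!	In CBC decrypt the previous ciphertext block, kept in %f12/%f14,
!	is xor-ed in after the block cipher call; the raw input words
!	then become the ivec for the next iteration.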
	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
$code.=<<___ if ($evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
$code.=<<___ if (!$evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $omask]0xc0	! partial store
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store
	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
	orn		%g0, $omask, $omask
$code.=<<___ if ($evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
$code.=<<___ if (!$evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.L${bits}_cbc_dec_loop2x:
	ldx		[$inp + 16], %o2
	ldx		[$inp + 24], %o3
	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	xor		%g4, %o0, %o4		! ^= rk[0]
	call		_aes${bits}_decrypt_2x
	fxor		%f12, %f0, %f0		! ^= ivec
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
$code.=<<___ if ($evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
$code.=<<___ if (!$evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f6, %f6
	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f2, [$out + 16]
	std		%f4, [$out + 24]
	orn		%g0, $omask, $omask
	stda		%f6, [$out + $omask]0xc0	! partial store
	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
	orn		%g0, $omask, $omask
$code.=<<___ if ($evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
$code.=<<___ if (!$evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
.L${bits}_cbc_dec_unaligned_ivec:
	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	srl		$omask, $ivoff, $omask
	faligndata	%f12, %f12, %f0
	faligndata	%f12, %f14, %f2
	faligndata	%f14, %f14, %f4
	stda		%f0, [$ivec + $omask]0xc0
	std		%f2, [$ivec + 8]
	add		$ivec, 16, $ivec
	orn		%g0, $omask, $omask
	stda		%f4, [$ivec + $omask]0xc0
.type	aes${bits}_t4_cbc_decrypt,#function
.size	aes${bits}_t4_cbc_decrypt,.-aes${bits}_t4_cbc_decrypt
&aes_cbc_decrypt_implement(128);
&aes_cbc_decrypt_implement(192);
&aes_cbc_decrypt_implement(256);
sub aes_ctr32_implement {
.globl	aes${bits}_t4_ctr32_encrypt
aes${bits}_t4_ctr32_encrypt:
	save		%sp, -$frame, %sp
	call		_aes${bits}_loadkey
	ld		[$ivec + 0], %l4	! counter
	ld		[$ivec + 12], %l7
	xor		%o5, %g4, %g4		! ^= rk[0]
	movxtod		%g4, %f14		! most significant 64 bits
	andcc		$len, 1, %g0		! is number of blocks even?
	sll		$ileft, 3, $ileft
	sub		$iright, $ileft, $iright
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_ctr32_loop2x
	srl		$omask, $ooff, $omask
.L${bits}_ctr32_loop:
	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %o2
	xor		%g5, %l7, %g1		! ^= rk[0]
	srl		%l7, 0, %l7		! clruw
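!	srl by 0 zero-extends the 32-bit counter word (hence "clruw"),
!	so counter arithmetic wraps modulo 2^32 as CTR requires.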
	aes_eround01	%f16, %f14, %f2, %f4
	aes_eround23	%f18, %f14, %f2, %f2
	call		_aes${bits}_encrypt_1x+8
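!	The first round pair was already issued above with the counter
!	block in %f14, so enter _aes${bits}_encrypt_1x two instructions
!	(8 bytes) past its entry point.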
	fxor		%f10, %f0, %f0		! ^= inp
	brnz,pt		$len, .L${bits}_ctr32_loop2x
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $omask]0xc0	! partial store
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store
	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
	orn		%g0, $omask, $omask
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.L${bits}_ctr32_loop2x:
	ldx		[$inp + 16], %o2
	ldx		[$inp + 24], %o3
	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	xor		%g5, %l7, %g1		! ^= rk[0]
	srl		%l7, 0, %l7		! clruw
	srl		%l7, 0, %l7		! clruw
	aes_eround01	%f16, %f14, %f2, %f8
	aes_eround23	%f18, %f14, %f2, %f2
	aes_eround01	%f16, %f14, %f6, %f10
	aes_eround23	%f18, %f14, %f6, %f6
	call		_aes${bits}_encrypt_2x+16
	fxor		%f8, %f0, %f0		! ^= inp
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_ctr32_loop2x
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f6, %f6
	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f2, [$out + 16]
	std		%f4, [$out + 24]
	orn		%g0, $omask, $omask
	stda		%f6, [$out + $omask]0xc0	! partial store
	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
	orn		%g0, $omask, $omask
.type	aes${bits}_t4_ctr32_encrypt,#function
.size	aes${bits}_t4_ctr32_encrypt,.-aes${bits}_t4_ctr32_encrypt
&aes_ctr32_implement(128);
&aes_ctr32_implement(192);
&aes_ctr32_implement(256);
AES_encrypt=aes_t4_encrypt
AES_decrypt=aes_t4_decrypt
.global	AES_set_encrypt_key
AES_set_encrypt_key=aes_t4_set_encrypt_key
.global	AES_set_decrypt_key
AES_set_decrypt_key=aes_t4_set_decrypt_key
my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
.globl	AES_cbc_encrypt
	ld		[$key + 240], %g1
	brz		$enc, .Lcbc_decrypt
	bl,pt		%icc, aes128_t4_cbc_encrypt
	be,pn		%icc, aes192_t4_cbc_encrypt
	ba		aes256_t4_cbc_encrypt
	bl,pt		%icc, aes128_t4_cbc_decrypt
	be,pn		%icc, aes192_t4_cbc_decrypt
	ba		aes256_t4_cbc_decrypt
.type	AES_cbc_encrypt,#function
.size	AES_cbc_encrypt,.-AES_cbc_encrypt
.asciz	"AES for SPARC T4, David S. Miller, Andy Polyakov"
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %visopf = (	"faligndata"	=> 0x048,
$ref = "$mnemonic\t$rs1,$rs2,$rd";
if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    return $ref if ($1&1);
	    # re-encode for upper double register addressing
	return sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref = "$mnemonic\t$rs1,$rs2,$rd";
my $opf = $mnemonic =~ /l$/ ? 0x01a : 0x18;
foreach ($rs1,$rs2,$rd) {
	if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
	else			{ return $ref; }
return sprintf ".word\t0x%08x !%s",
		0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
sub unaes_round {	# 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my %aesopf = (	"aes_eround01"	=> 0,
		"aes_eround23"	=> 1,
		"aes_dround01"	=> 2,
		"aes_dround23"	=> 3,
		"aes_eround01_l"=> 4,
		"aes_eround23_l"=> 5,
		"aes_dround01_l"=> 6,
		"aes_dround23_l"=> 7,
		"aes_kexpand1"	=> 8 );
$ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
if (defined($opf=$aesopf{$mnemonic})) {
	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    return $ref if ($1&1);
	    # re-encode for upper double register addressing
	return sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
sub unaes_kexpand {	# 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %aesopf = (	"aes_kexpand0"	=> 0x130,
		"aes_kexpand2"	=> 0x131 );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
if (defined($opf=$aesopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    return $ref if ($1&1);
	    # re-encode for upper double register addressing
	return sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
sub unmovxtox {		# 2-argument instructions
my ($mnemonic,$rs,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
my %movxopf = (	"movdtox"	=> 0x110,
		"movstouw"	=> 0x111,
		"movstosw"	=> 0x113,
		"movwtos"	=> 0x119 );
$ref = "$mnemonic\t$rs,$rd";
if (defined($opf=$movxopf{$mnemonic})) {
	    return $ref if (!/%([fgoli])([0-9]{1,2})/);
	    return $ref if ($2&1);
	    # re-encode for upper double register addressing
	return sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;
	s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&unaes_round($1,$2,$3,$4,$5)
	s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unaes_kexpand($1,$2,$3,$4)
	s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
		&unmovxtox($1,$2,$3)
	s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
		&unmovxtox($1,$2,$3)
	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
	s/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unalignaddr($1,$2,$3,$4)