2 # Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # Initial support for Fujitsu SPARC64 X/X+ comprises minimally
20 # required key setup and single-block procedures.
24 # Add "teaser" CBC and CTR mode-specific subroutines. "Teaser" means
25 # that the parallelizable nature of CBC decrypt and CTR is not utilized
26 # yet. CBC encrypt, on the other hand, is as good as it can possibly
27 # get, processing one byte in 4.1 cycles with a 128-bit key on SPARC64 X.
28 # This is ~6x faster than a pure software implementation...
32 # Switch from faligndata to fshiftorx, which makes it possible to omit
33 # alignaddr instructions and improves single-block and short-input
34 # performance with misaligned data.
# Take the optional output file name as the last command-line argument
# and, when present, redirect STDOUT to it so the generated assembly is
# written there.  Three-argument open avoids the 2-arg form's
# mode/filename parsing pitfalls, and a failed open is now fatal
# instead of silently ignored.
$output = pop;
open STDOUT, '>', $output or die "can't open $output: $!" if $output;
39 my ($inp,$out,$key,$rounds,$tmp,$mask) = map("%o$_",(0..5));
43 # define __ASSEMBLER__ 1
45 #include "crypto/sparc_arch.h"
47 #define LOCALS (STACK_BIAS+STACK_FRAME)
54 and $inp, 7, $tmp ! is input aligned?
56 ldd [$key + 0], %f6 ! round[0]
59 ld [$key + 240], $rounds
62 add %o7, .Linp_align-1b, %o7
65 ldd [$inp + 0], %f0 ! load input
66 brz,pt $tmp, .Lenc_inp_aligned
69 ldd [%o7 + $tmp], %f14 ! shift left params
71 fshiftorx %f0, %f2, %f14, %f0
72 fshiftorx %f2, %f4, %f14, %f2
75 ldd [$key + 16], %f10 ! round[1]
78 fxor %f0, %f6, %f0 ! ^=round[0]
80 ldd [$key + 32], %f6 ! round[2]
83 sub $rounds, 4, $rounds
87 faesencx %f2, %f10, %f0
88 faesencx %f4, %f12, %f2
94 faesencx %f2, %f6, %f0
95 faesencx %f4, %f8, %f2
99 brnz,a $rounds, .Loop_enc
100 sub $rounds, 2, $rounds
102 andcc $out, 7, $tmp ! is output aligned?
105 srl $mask, $tmp, $mask
110 faesencx %f2, %f10, %f0
111 faesencx %f4, %f12, %f2
112 ldd [%o7 + $tmp], %f14 ! shift right params
115 faesenclx %f2, %f6, %f0
116 faesenclx %f4, %f8, %f2
118 bnz,pn %icc, .Lenc_out_unaligned
129 fshiftorx %f0, %f0, %f14, %f4
130 fshiftorx %f0, %f2, %f14, %f6
131 fshiftorx %f2, %f2, %f14, %f8
133 stda %f4, [$out + $mask]0xc0 ! partial store
135 stda %f8, [$inp + $tmp]0xc0 ! partial store
138 .type aes_fx_encrypt,#function
139 .size aes_fx_encrypt,.-aes_fx_encrypt
141 .globl aes_fx_decrypt
144 and $inp, 7, $tmp ! is input aligned?
146 ldd [$key + 0], %f6 ! round[0]
149 ld [$key + 240], $rounds
152 add %o7, .Linp_align-1b, %o7
155 ldd [$inp + 0], %f0 ! load input
156 brz,pt $tmp, .Ldec_inp_aligned
159 ldd [%o7 + $tmp], %f14 ! shift left params
161 fshiftorx %f0, %f2, %f14, %f0
162 fshiftorx %f2, %f4, %f14, %f2
165 ldd [$key + 16], %f10 ! round[1]
166 ldd [$key + 24], %f12
168 fxor %f0, %f6, %f0 ! ^=round[0]
170 ldd [$key + 32], %f6 ! round[2]
173 sub $rounds, 4, $rounds
177 faesdecx %f2, %f10, %f0
178 faesdecx %f4, %f12, %f2
179 ldd [$key + 16], %f10
180 ldd [$key + 24], %f12
184 faesdecx %f2, %f6, %f0
185 faesdecx %f4, %f8, %f2
189 brnz,a $rounds, .Loop_dec
190 sub $rounds, 2, $rounds
192 andcc $out, 7, $tmp ! is output aligned?
195 srl $mask, $tmp, $mask
200 faesdecx %f2, %f10, %f0
201 faesdecx %f4, %f12, %f2
202 ldd [%o7 + $tmp], %f14 ! shift right params
205 faesdeclx %f2, %f6, %f0
206 faesdeclx %f4, %f8, %f2
208 bnz,pn %icc, .Ldec_out_unaligned
219 fshiftorx %f0, %f0, %f14, %f4
220 fshiftorx %f0, %f2, %f14, %f6
221 fshiftorx %f2, %f2, %f14, %f8
223 stda %f4, [$out + $mask]0xc0 ! partial store
225 stda %f8, [$inp + $tmp]0xc0 ! partial store
228 .type aes_fx_decrypt,#function
229 .size aes_fx_decrypt,.-aes_fx_decrypt
233 my ($inp,$bits,$out,$tmp,$inc) = map("%o$_",(0..5));
235 .globl aes_fx_set_decrypt_key
237 aes_fx_set_decrypt_key:
242 .type aes_fx_set_decrypt_key,#function
243 .size aes_fx_set_decrypt_key,.-aes_fx_set_decrypt_key
245 .globl aes_fx_set_encrypt_key
247 aes_fx_set_encrypt_key:
257 add %o7, .Linp_align-1b, %o7
259 ldd [%o7 + $tmp], %f10 ! shift left params
269 brz,pt $tmp, .L256aligned
273 fshiftorx %f0, %f2, %f10, %f0
274 fshiftorx %f2, %f4, %f10, %f2
275 fshiftorx %f4, %f6, %f10, %f4
276 fshiftorx %f6, %f8, %f10, %f6
280 and $inc, `14*16`, $tmp
281 st $bits, [$out + 240] ! store rounds
282 add $out, $tmp, $out ! start or end of key schedule
283 sllx $inc, 4, $inc ! 16 or -16
285 for ($i=0; $i<6; $i++) {
288 faeskeyx %f6, `0x10+$i`, %f0
291 faeskeyx %f0, 0x00, %f2
293 faeskeyx %f2, 0x01, %f4
296 faeskeyx %f4, 0x00, %f6
301 faeskeyx %f6, `0x10+$i`, %f0
304 faeskeyx %f0, 0x00, %f2
311 xor %o0, %o0, %o0 ! return 0
315 brz,pt $tmp, .L192aligned
319 fshiftorx %f0, %f2, %f10, %f0
320 fshiftorx %f2, %f4, %f10, %f2
321 fshiftorx %f4, %f6, %f10, %f4
325 and $inc, `12*16`, $tmp
326 st $bits, [$out + 240] ! store rounds
327 add $out, $tmp, $out ! start or end of key schedule
328 sllx $inc, 4, $inc ! 16 or -16
330 for ($i=0; $i<8; $i+=2) {
333 faeskeyx %f4, `0x10+$i`, %f0
336 faeskeyx %f0, 0x00, %f2
338 faeskeyx %f2, 0x00, %f4
341 faeskeyx %f4, `0x10+$i+1`, %f0
343 faeskeyx %f0, 0x00, %f2
347 $code.=<<___ if ($i<6);
348 faeskeyx %f2, 0x00, %f4
355 xor %o0, %o0, %o0 ! return 0
359 brz,pt $tmp, .L128aligned
363 fshiftorx %f0, %f2, %f10, %f0
364 fshiftorx %f2, %f4, %f10, %f2
368 and $inc, `10*16`, $tmp
369 st $bits, [$out + 240] ! store rounds
370 add $out, $tmp, $out ! start or end of key schedule
371 sllx $inc, 4, $inc ! 16 or -16
373 for ($i=0; $i<10; $i++) {
376 faeskeyx %f2, `0x10+$i`, %f0
379 faeskeyx %f0, 0x00, %f2
386 xor %o0, %o0, %o0 ! return 0
387 .type aes_fx_set_encrypt_key,#function
388 .size aes_fx_set_encrypt_key,.-aes_fx_set_encrypt_key
392 my ($inp,$out,$len,$key,$ivp,$dir) = map("%i$_",(0..5));
393 my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
394 my ($iv0,$iv1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
395 = map("%f$_",grep { !($_ & 1) } (16 .. 62));
396 my ($ileft,$iright) = ($ialign,$oalign);
399 .globl aes_fx_cbc_encrypt
402 save %sp, -STACK_FRAME-16, %sp
406 brz,pn $len, .Lcbc_no_data
407 sll $ialign, 3, $ileft
410 add %o7, .Linp_align-1b, %o7
412 ld [$key + 240], $rounds
414 ld [$ivp + 0], %f0 ! load ivec
417 sll $oalign, 3, $mask
421 sll $rounds, 4, $rounds
422 add $rounds, $key, $end
423 ldd [$key + 0], $r0hi ! round[0]
424 ldd [$key + 8], $r0lo
428 ldd [$end + 0], $rlhi ! round[last]
429 ldd [$end + 8], $rllo
433 ldd [$key + 16], %f10 ! round[1]
434 ldd [$key + 24], %f12
436 ldd [%o7 + $ileft], $fshift ! shift left params
438 ldd [$inp - 16], $in0 ! load input
440 ldda [$inp]0x82, $intail ! non-faulting load
441 brz $dir, .Lcbc_decrypt
442 add $inp, $inc, $inp ! inp+=16
444 fxor $r0hi, %f0, %f0 ! ivec^=round[0]
446 fshiftorx $in0, $in1, $fshift, $in0
447 fshiftorx $in1, $intail, $fshift, $in1
451 fxor $in0, %f0, %f0 ! inp^ivec^round[0]
453 ldd [$key + 32], %f6 ! round[2]
456 sub $rounds, 16*6, $inner
460 faesencx %f2, %f10, %f0
461 faesencx %f4, %f12, %f2
462 ldd [$end + 16], %f10
463 ldd [$end + 24], %f12
467 faesencx %f2, %f6, %f0
468 faesencx %f4, %f8, %f2
472 brnz,a $inner, .Lcbc_enc
473 sub $inner, 16*2, $inner
476 faesencx %f2, %f10, %f0
477 faesencx %f4, %f12, %f2
478 ldd [$end + 16], %f10 ! round[last-1]
479 ldd [$end + 24], %f12
483 ldd [$inp - 8], $in1 ! load next input block
484 ldda [$inp]0x82, $intail ! non-faulting load
485 add $inp, $inc, $inp ! inp+=16
488 faesencx %f2, %f6, %f0
489 faesencx %f4, %f8, %f2
491 fshiftorx $in0, $in1, $fshift, $in0
492 fshiftorx $in1, $intail, $fshift, $in1
495 faesencx %f2, %f10, %f0
496 faesencx %f4, %f12, %f2
497 ldd [$key + 16], %f10 ! round[1]
498 ldd [$key + 24], %f12
500 fxor $r0hi, $in0, $in0 ! inp^=round[0]
501 fxor $r0lo, $in1, $in1
504 faesenclx %f2, $rlhi, %f0
505 faesenclx %f4, $rllo, %f2
507 brnz,pn $oalign, .Lcbc_enc_unaligned_out
514 brnz,a $len, .Loop_cbc_enc
517 st %f0, [$ivp + 0] ! output ivec
527 .Lcbc_enc_unaligned_out:
528 ldd [%o7 + $mask], $fshift ! shift right params
530 srl $mask, $oalign, $mask
531 sub %g0, $ileft, $iright
533 fshiftorx %f0, %f0, $fshift, %f6
534 fshiftorx %f0, %f2, $fshift, %f8
536 stda %f6, [$out + $mask]0xc0 ! partial store
537 orn %g0, $mask, $mask
540 brz $len, .Lcbc_enc_unaligned_out_done
542 b .Loop_cbc_enc_unaligned_out
546 .Loop_cbc_enc_unaligned_out:
548 fxor $in0, %f0, %f0 ! inp^ivec^round[0]
550 ldd [$key + 32], %f6 ! round[2]
554 faesencx %f2, %f10, %f0
555 faesencx %f4, %f12, %f2
556 ldd [$key + 48], %f10 ! round[3]
557 ldd [$key + 56], %f12
561 brz $ileft, .Lcbc_enc_aligned_inp
565 sllx %o0, $ileft, %o0
566 srlx %o1, $iright, %g1
567 sllx %o1, $ileft, %o1
569 srlx %o2, $iright, %o2
572 .Lcbc_enc_aligned_inp:
574 faesencx %f2, %f6, %f0
575 faesencx %f4, %f8, %f2
576 ldd [$key + 64], %f6 ! round[4]
579 sub $rounds, 16*8, $inner
581 stx %o0, [%sp + LOCALS + 0]
582 stx %o1, [%sp + LOCALS + 8]
583 add $inp, $inc, $inp ! inp+=16
588 faesencx %f2, %f10, %f0
589 faesencx %f4, %f12, %f2
590 ldd [$end + 16], %f10
591 ldd [$end + 24], %f12
595 faesencx %f2, %f6, %f0
596 faesencx %f4, %f8, %f2
600 brnz,a $inner, .Lcbc_enc_unaligned
601 sub $inner, 16*2, $inner
604 faesencx %f2, %f10, %f0
605 faesencx %f4, %f12, %f2
606 ldd [$end + 16], %f10 ! round[last-1]
607 ldd [$end + 24], %f12
610 faesencx %f2, %f6, %f0
611 faesencx %f4, %f8, %f2
613 ldd [%sp + LOCALS + 0], $in0
614 ldd [%sp + LOCALS + 8], $in1
617 faesencx %f2, %f10, %f0
618 faesencx %f4, %f12, %f2
619 ldd [$key + 16], %f10 ! round[1]
620 ldd [$key + 24], %f12
622 fxor $r0hi, $in0, $in0 ! inp^=round[0]
623 fxor $r0lo, $in1, $in1
626 faesenclx %f2, $rlhi, %f0
627 faesenclx %f4, $rllo, %f2
629 fshiftorx $outhead, %f0, $fshift, %f6
630 fshiftorx %f0, %f2, $fshift, %f8
635 brnz,a $len, .Loop_cbc_enc_unaligned_out
638 .Lcbc_enc_unaligned_out_done:
639 fshiftorx %f2, %f2, $fshift, %f8
640 stda %f8, [$out + $mask]0xc0 ! partial store
642 st %f0, [$ivp + 0] ! output ivec
652 fshiftorx $in0, $in1, $fshift, $in0
653 fshiftorx $in1, $intail, $fshift, $in1
658 fxor $in0, $r0hi, %f0 ! inp^round[0]
659 fxor $in1, $r0lo, %f2
660 ldd [$key + 32], %f6 ! round[2]
663 sub $rounds, 16*6, $inner
667 faesdecx %f2, %f10, %f0
668 faesdecx %f4, %f12, %f2
669 ldd [$end + 16], %f10
670 ldd [$end + 24], %f12
674 faesdecx %f2, %f6, %f0
675 faesdecx %f4, %f8, %f2
679 brnz,a $inner, .Lcbc_dec
680 sub $inner, 16*2, $inner
683 faesdecx %f2, %f10, %f0
684 faesdecx %f4, %f12, %f2
685 ldd [$end + 16], %f10 ! round[last-1]
686 ldd [$end + 24], %f12
689 faesdecx %f2, %f6, %f0
690 faesdecx %f4, %f8, %f2
691 fxor $iv0, $rlhi, %f6 ! ivec^round[last]
692 fxor $iv1, $rllo, %f8
698 ldd [$inp - 8], $in1 ! load next input block
699 ldda [$inp]0x82, $intail ! non-faulting load
700 add $inp, $inc, $inp ! inp+=16
703 faesdecx %f2, %f10, %f0
704 faesdecx %f4, %f12, %f2
705 ldd [$key + 16], %f10 ! round[1]
706 ldd [$key + 24], %f12
708 fshiftorx $in0, $in1, $fshift, $in0
709 fshiftorx $in1, $intail, $fshift, $in1
712 faesdeclx %f2, %f6, %f0
713 faesdeclx %f4, %f8, %f2
715 brnz,pn $oalign, .Lcbc_dec_unaligned_out
722 brnz,a $len, .Loop_cbc_dec
725 st $iv0, [$ivp + 0] ! output ivec
726 st $iv0#lo, [$ivp + 4]
728 st $iv1#lo, [$ivp + 12]
734 .Lcbc_dec_unaligned_out:
735 ldd [%o7 + $mask], $fshift ! shift right params
737 srl $mask, $oalign, $mask
738 sub %g0, $ileft, $iright
740 fshiftorx %f0, %f0, $fshift, %f6
741 fshiftorx %f0, %f2, $fshift, %f8
743 stda %f6, [$out + $mask]0xc0 ! partial store
744 orn %g0, $mask, $mask
747 brz $len, .Lcbc_dec_unaligned_out_done
749 b .Loop_cbc_dec_unaligned_out
753 .Loop_cbc_dec_unaligned_out:
755 fxor $in0, $r0hi, %f0 ! inp^round[0]
756 fxor $in1, $r0lo, %f2
757 ldd [$key + 32], %f6 ! round[2]
761 faesdecx %f2, %f10, %f0
762 faesdecx %f4, %f12, %f2
763 ldd [$key + 48], %f10 ! round[3]
764 ldd [$key + 56], %f12
768 brz $ileft, .Lcbc_dec_aligned_inp
772 sllx %o0, $ileft, %o0
773 srlx %o1, $iright, %g1
774 sllx %o1, $ileft, %o1
776 srlx %o2, $iright, %o2
779 .Lcbc_dec_aligned_inp:
781 faesdecx %f2, %f6, %f0
782 faesdecx %f4, %f8, %f2
783 ldd [$key + 64], %f6 ! round[4]
786 sub $rounds, 16*8, $inner
788 stx %o0, [%sp + LOCALS + 0]
789 stx %o1, [%sp + LOCALS + 8]
790 add $inp, $inc, $inp ! inp+=16
795 faesdecx %f2, %f10, %f0
796 faesdecx %f4, %f12, %f2
797 ldd [$end + 16], %f10
798 ldd [$end + 24], %f12
802 faesdecx %f2, %f6, %f0
803 faesdecx %f4, %f8, %f2
807 brnz,a $inner, .Lcbc_dec_unaligned
808 sub $inner, 16*2, $inner
811 faesdecx %f2, %f10, %f0
812 faesdecx %f4, %f12, %f2
813 ldd [$end + 16], %f10 ! round[last-1]
814 ldd [$end + 24], %f12
817 faesdecx %f2, %f6, %f0
818 faesdecx %f4, %f8, %f2
820 fxor $iv0, $rlhi, %f6 ! ivec^round[last]
821 fxor $iv1, $rllo, %f8
824 ldd [%sp + LOCALS + 0], $in0
825 ldd [%sp + LOCALS + 8], $in1
828 faesdecx %f2, %f10, %f0
829 faesdecx %f4, %f12, %f2
830 ldd [$key + 16], %f10 ! round[1]
831 ldd [$key + 24], %f12
834 faesdeclx %f2, %f6, %f0
835 faesdeclx %f4, %f8, %f2
837 fshiftorx $outhead, %f0, $fshift, %f6
838 fshiftorx %f0, %f2, $fshift, %f8
843 brnz,a $len, .Loop_cbc_dec_unaligned_out
846 .Lcbc_dec_unaligned_out_done:
847 fshiftorx %f2, %f2, $fshift, %f8
848 stda %f8, [$out + $mask]0xc0 ! partial store
850 st $iv0, [$ivp + 0] ! output ivec
851 st $iv0#lo, [$ivp + 4]
853 st $iv1#lo, [$ivp + 12]
857 .type aes_fx_cbc_encrypt,#function
858 .size aes_fx_cbc_encrypt,.-aes_fx_cbc_encrypt
862 my ($inp,$out,$len,$key,$ivp) = map("%i$_",(0..5));
863 my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
864 my ($ctr0,$ctr1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
865 = map("%f$_",grep { !($_ & 1) } (16 .. 62));
866 my ($ileft,$iright) = ($ialign, $oalign);
870 .globl aes_fx_ctr32_encrypt_blocks
872 aes_fx_ctr32_encrypt_blocks:
873 save %sp, -STACK_FRAME-16, %sp
877 brz,pn $len, .Lctr32_no_data
878 sll $ialign, 3, $ileft
881 add %o7, .Linp_align - .Lpic, %o7
883 ld [$key + 240], $rounds
885 ld [$ivp + 0], $ctr0 ! load counter
887 ld [$ivp + 4], $ctr0#lo
888 sll $oalign, 3, $mask
890 ld [$ivp + 12], $ctr1#lo
891 ldd [%o7 + 128], $one
893 sll $rounds, 4, $rounds
894 add $rounds, $key, $end
895 ldd [$key + 0], $r0hi ! round[0]
896 ldd [$key + 8], $r0lo
900 ldd [$key + 16], %f10 ! round[1]
901 ldd [$key + 24], %f12
905 ldd [$end + 0], $rlhi ! round[last]
906 ldd [$end + 8], $rllo
908 ldd [%o7 + $ileft], $fshift ! shiftleft params
910 ldd [$inp - 16], $in0 ! load input
912 ldda [$inp]0x82, $intail ! non-faulting load
913 add $inp, $inc, $inp ! inp+=16
915 fshiftorx $in0, $in1, $fshift, $in0
916 fshiftorx $in1, $intail, $fshift, $in1
919 fxor $ctr0, $r0hi, %f0 ! counter^round[0]
920 fxor $ctr1, $r0lo, %f2
921 ldd [$key + 32], %f6 ! round[2]
924 sub $rounds, 16*6, $inner
928 faesencx %f2, %f10, %f0
929 faesencx %f4, %f12, %f2
930 ldd [$end + 16], %f10
931 ldd [$end + 24], %f12
935 faesencx %f2, %f6, %f0
936 faesencx %f4, %f8, %f2
940 brnz,a $inner, .Lctr32_enc
941 sub $inner, 16*2, $inner
944 faesencx %f2, %f10, %f0
945 faesencx %f4, %f12, %f2
946 ldd [$end + 16], %f10 ! round[last-1]
947 ldd [$end + 24], %f12
950 faesencx %f2, %f6, %f0
951 faesencx %f4, %f8, %f2
952 fxor $in0, $rlhi, %f6 ! inp^round[last]
953 fxor $in1, $rllo, %f8
957 ldd [$inp - 8], $in1 ! load next input block
958 ldda [$inp]0x82, $intail ! non-faulting load
959 add $inp, $inc, $inp ! inp+=16
962 faesencx %f2, %f10, %f0
963 faesencx %f4, %f12, %f2
964 ldd [$key + 16], %f10 ! round[1]
965 ldd [$key + 24], %f12
967 fshiftorx $in0, $in1, $fshift, $in0
968 fshiftorx $in1, $intail, $fshift, $in1
969 fpadd32 $ctr1, $one, $ctr1 ! increment counter
972 faesenclx %f2, %f6, %f0
973 faesenclx %f4, %f8, %f2
975 brnz,pn $oalign, .Lctr32_unaligned_out
982 brnz,a $len, .Loop_ctr32
990 .Lctr32_unaligned_out:
991 ldd [%o7 + $mask], $fshift ! shift right params
993 srl $mask, $oalign, $mask
994 sub %g0, $ileft, $iright
996 fshiftorx %f0, %f0, $fshift, %f6
997 fshiftorx %f0, %f2, $fshift, %f8
999 stda %f6, [$out + $mask]0xc0 ! partial store
1000 orn %g0, $mask, $mask
1003 brz $len, .Lctr32_unaligned_out_done
1005 b .Loop_ctr32_unaligned_out
1009 .Loop_ctr32_unaligned_out:
1011 fxor $ctr0, $r0hi, %f0 ! counter^round[0]
1012 fxor $ctr1, $r0lo, %f2
1013 ldd [$key + 32], %f6 ! round[2]
1014 ldd [$key + 40], %f8
1017 faesencx %f2, %f10, %f0
1018 faesencx %f4, %f12, %f2
1019 ldd [$key + 48], %f10 ! round[3]
1020 ldd [$key + 56], %f12
1022 ldx [$inp - 16], %o0
1024 brz $ileft, .Lctr32_aligned_inp
1028 sllx %o0, $ileft, %o0
1029 srlx %o1, $iright, %g1
1030 sllx %o1, $ileft, %o1
1032 srlx %o2, $iright, %o2
1035 .Lctr32_aligned_inp:
1037 faesencx %f2, %f6, %f0
1038 faesencx %f4, %f8, %f2
1039 ldd [$key + 64], %f6 ! round[4]
1040 ldd [$key + 72], %f8
1042 sub $rounds, 16*8, $inner
1044 stx %o0, [%sp + LOCALS + 0]
1045 stx %o1, [%sp + LOCALS + 8]
1046 add $inp, $inc, $inp ! inp+=16
1049 .Lctr32_enc_unaligned:
1051 faesencx %f2, %f10, %f0
1052 faesencx %f4, %f12, %f2
1053 ldd [$end + 16], %f10
1054 ldd [$end + 24], %f12
1058 faesencx %f2, %f6, %f0
1059 faesencx %f4, %f8, %f2
1063 brnz,a $inner, .Lctr32_enc_unaligned
1064 sub $inner, 16*2, $inner
1067 faesencx %f2, %f10, %f0
1068 faesencx %f4, %f12, %f2
1069 ldd [$end + 16], %f10 ! round[last-1]
1070 ldd [$end + 24], %f12
1071 fpadd32 $ctr1, $one, $ctr1 ! increment counter
1074 faesencx %f2, %f6, %f0
1075 faesencx %f4, %f8, %f2
1076 fxor $in0, $rlhi, %f6 ! inp^round[last]
1077 fxor $in1, $rllo, %f8
1078 ldd [%sp + LOCALS + 0], $in0
1079 ldd [%sp + LOCALS + 8], $in1
1082 faesencx %f2, %f10, %f0
1083 faesencx %f4, %f12, %f2
1084 ldd [$key + 16], %f10 ! round[1]
1085 ldd [$key + 24], %f12
1088 faesenclx %f2, %f6, %f0
1089 faesenclx %f4, %f8, %f2
1091 fshiftorx $outhead, %f0, $fshift, %f6
1092 fshiftorx %f0, %f2, $fshift, %f8
1097 brnz,a $len, .Loop_ctr32_unaligned_out
1100 .Lctr32_unaligned_out_done:
1101 fshiftorx %f2, %f2, $fshift, %f8
1102 stda %f8, [$out + $mask]0xc0 ! partial store
1106 .type aes_fx_ctr32_encrypt_blocks,#function
1107 .size aes_fx_ctr32_encrypt_blocks,.-aes_fx_ctr32_encrypt_blocks
! Shift-parameter tables consumed by fshiftorx.  Each 8-byte row
! corresponds to one possible byte misalignment; the realignment code
! loads a row with ldd and feeds it to fshiftorx to merge two adjacent
! doublewords when input/output is not 8-byte aligned.
! NOTE(review): the per-byte field meaning is fixed by the fshiftorx
! operand format - confirm against the SPARC64 X extension manual.
1110 .Linp_align: ! fshiftorx parameters for left shift toward %rs1
1111 .byte 0, 0, 64, 0, 0, 64, 0, -64
1112 .byte 0, 0, 56, 8, 0, 56, 8, -56
1113 .byte 0, 0, 48, 16, 0, 48, 16, -48
1114 .byte 0, 0, 40, 24, 0, 40, 24, -40
1115 .byte 0, 0, 32, 32, 0, 32, 32, -32
1116 .byte 0, 0, 24, 40, 0, 24, 40, -24
1117 .byte 0, 0, 16, 48, 0, 16, 48, -16
1118 .byte 0, 0, 8, 56, 0, 8, 56, -8
1119 .Lout_align: ! fshiftorx parameters for right shift toward %rs2
1120 .byte 0, 0, 0, 64, 0, 0, 64, 0
1121 .byte 0, 0, 8, 56, 0, 8, 56, -8
1122 .byte 0, 0, 16, 48, 0, 16, 48, -16
1123 .byte 0, 0, 24, 40, 0, 24, 40, -24
1124 .byte 0, 0, 32, 32, 0, 32, 32, -32
1125 .byte 0, 0, 40, 24, 0, 40, 24, -40
1126 .byte 0, 0, 48, 16, 0, 48, 16, -48
1127 .byte 0, 0, 56, 8, 0, 56, 8, -56
1130 .asciz "AES for Fujitsu SPARC64 X, CRYPTOGAMS by <appro\@openssl.org>"
1134 # Purpose of these subroutines is to explicitly encode VIS instructions,
1135 # so that one can compile the module without having to specify VIS
1136 # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
1137 # Idea is to reserve for option to produce "universal" binary and let
1138 # programmer detect if current CPU is VIS capable at run-time.
# NOTE(review): fragment of a VIS-instruction encoder; the enclosing
# "sub" header and closing brace are outside this excerpt.  Per the
# commentary above, these helpers hand-encode extension instructions as
# raw .word directives so the module assembles even when the assembler
# was not invoked with VIS support enabled.
1140 my ($mnemonic,$rs1,$rs2,$rd)=@_;
# Opcode-field (opf) values for the VIS mnemonics handled here.
1142 my %visopf = ( "faligndata" => 0x048,
1143 "bshuffle" => 0x04c,
# Fallback: emit the instruction verbatim if it cannot be encoded.
1148 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1150 if ($opf=$visopf{$mnemonic}) {
# Every operand must parse as an FP register, and must be even-numbered
# (a double-precision pair); otherwise fall back to the textual form.
1151 foreach ($rs1,$rs2,$rd) {
1152 return $ref if (!/%f([0-9]{1,2})/);
1155 return $ref if ($1&1);
1156 # re-encode for upper double register addressing
# Pack rd/rs1/opf/rs2 into the FPop bit fields of the base opcode and
# keep the original instruction text as an assembly comment.
1161 return sprintf ".word\t0x%08x !%s",
1162 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
# NOTE(review): fragment of the encoder for integer-register VIS ops
# (alignaddr/alignaddrl); the enclosing "sub" header and the operand
# re-numbering lines are outside this excerpt.
1170 my ($mnemonic,$rs1,$rs2,$rd)=@_;
# Map the register-class letter (%g/%o/%l/%i) to its numeric base so a
# register like %o3 can be turned into its 5-bit encoding.
1171 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
1173 my %visopf = ( "alignaddr" => 0x018,
1175 "alignaddrl" => 0x01a );
# Fallback: emit the instruction verbatim if it cannot be encoded.
1177 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1179 if ($opf=$visopf{$mnemonic}) {
# All operands must parse as integer registers.
1180 foreach ($rs1,$rs2,$rd) {
1181 return $ref if (!/%([goli])([0-9])/);
# Pack rd/rs1/opf/rs2 into the bit fields of the base opcode, keeping
# the original instruction text as an assembly comment.
1185 return sprintf ".word\t0x%08x !%s",
1186 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
# NOTE(review): fragment of the encoder for the Fujitsu AES-extension
# instructions (faesencx/faesenclx/faesdeclx/faeskeyx, and presumably
# faesdecx on a line missing from this excerpt); the enclosing "sub"
# header is not visible here.
1194 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1196 my %aesopf = ( "faesencx" => 0x90,
1198 "faesenclx" => 0x92,
1199 "faesdeclx" => 0x93,
1200 "faeskeyx" => 0x94 );
# Fallback: emit the instruction verbatim if it cannot be encoded.
1202 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1204 if (defined($opf=$aesopf{$mnemonic})) {
# rs2 is either an even FP register (fold bit 5 down for upper-bank
# double-register addressing) or a numeric immediate (the faeskeyx
# round-constant operand); oct() normalizes 0/0x-prefixed immediates.
1205 $rs2 = ($rs2 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs2;
1206 $rs2 = oct($rs2) if ($rs2 =~ /^0/);
# rs1 and rd must be even-numbered FP registers, else fall back.
1208 foreach ($rs1,$rd) {
1209 return $ref if (!/%f([0-9]{1,2})/);
1212 return $ref if ($1&1);
1213 # re-encode for upper double register addressing
# Pack the fields into the op=2 / op3=0x36 format, keeping the original
# instruction text as an assembly comment.
1218 return sprintf ".word\t0x%08x !%s",
1219 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
# NOTE(review): fragment of the encoder for the 4-operand fshiftorx
# instruction; the enclosing "sub" header and the operand re-numbering
# lines are outside this excerpt.
1227 my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
1229 my %aesopf = ( "fshiftorx" => 0x0b );
# Fallback: emit the instruction verbatim if it cannot be encoded.
1231 $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
1233 if (defined($opf=$aesopf{$mnemonic})) {
# All four operands must be even-numbered FP registers, else fall back.
1234 foreach ($rs1,$rs2,$rs3,$rd) {
1235 return $ref if (!/%f([0-9]{1,2})/);
1238 return $ref if ($1&1);
1239 # re-encode for upper double register addressing
# Pack the fields into the op=2 / op3=0x37 three-source format, keeping
# the original instruction text as an assembly comment.
1244 return sprintf ".word\t0x%08x !%s",
1245 2<<30|$rd<<25|0x37<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
# Post-processing pass: walk the generated assembly line by line,
# evaluate compile-time arithmetic, then rewrite extension mnemonics the
# assembler may not understand into raw .word encodings via the helper
# subs above.  NOTE(review): several lines of this loop (the
# substitution replacement parts, the closing brace, and the final
# print of $_) are missing from this excerpt.
1252 foreach (split("\n",$code)) {
# Evaluate back-ticked expressions such as `0x10+$i` emitted above.
1253 s/\`([^\`]*)\`/eval $1/ge;
# %fN#lo denotes the odd (low) half of an even/odd FP register pair.
1255 s/%f([0-9]+)#lo/sprintf "%%f%d",$1+1/ge;
# 3-operand AES-extension ops (faes...x) -> encoded via unfx().
1257 s/\b(faes[^x]{3,4}x)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
# 4-operand FP ops (fshiftorx) -> encoded via unfx3src().
1260 s/\b([f][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1261 &unfx3src($1,$2,$3,$4,$5)
# 3-operand VIS FP ops -> encoded via unvis().
1263 s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
# alignaddr/alignaddrl on integer registers -> encoded via unvis3().
1266 s/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
1267 &unvis3($1,$2,$3,$4)
# A failed close means truncated assembly output, so make it fatal.
1272 close STDOUT or die "error closing STDOUT: $!";