3 # Specific modes implementations for SPARC Architecture 2011. There
4 # is T4 dependency though, an ASI value that is not specified in the
5 # Architecture Manual. But as the SPARC universe is rather monocultural,
6 # we assume that any processor capable of executing the crypto instructions
7 # can handle the ASI in question as well. This means that we ought to
8 # keep our eyes open when new processors emerge...
10 # As for the above-mentioned ASI: it's a so-called "block initializing
11 # store", which cancels the "read" in "read-update-write" on cache lines.
12 # This is a "cooperative" optimization, as it reduces the overall pressure
13 # on the memory interface. The benefits can't be observed/quantified with
14 # the usual benchmarks; on the contrary, you can notice that single-thread
15 # performance for parallelizable modes is ~1.5% worse for the largest
16 # block sizes [though a few percent better for not so long ones]. All
17 # this is based on suggestions from David Miller.
# Initialize ABI-dependent assembler globals; call with @ARGV.
# -m64 or -xarch=v9 on the command line selects the 64-bit ABI.
19 sub asm_init { # to be called with @ARGV as argument
20 for (@_) { $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
# 64-bit ABI: 2047-byte stack bias, 192-byte register frame, and
# %xcc as the condition-code register matching size_t width.
21 if ($::abibits==64) { $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
# 32-bit ABI: no stack bias, 112-byte frame, %icc for size_t compares.
22 else { $::bias=0; $::frame=112; $::size_t_cc="%icc"; }
# Symbolic names for subroutine arguments in input registers %i0-%i4
# (the sixth value produced by map is discarded).
26 my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
# Scratch values held in local registers %l0-%l5 (extra map results
# beyond the six names are discarded).
28 my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));
# Append to $::code a T4 implementation of ${alg}${bits}_t4_cbc_encrypt.
# Emits: a 1x per-block encrypt loop, unaligned-output and unaligned-ivec
# fixups via faligndata + partial stores (ASI 0xc0), and a bulk path
# (.L*cbc_enc_blk) that uses block-initializing stores (ASI 0xe2,
# T4-specific) when the output is 8-byte aligned, $len is large enough
# and input/output buffers are distinct (see the movrnz/movleu/brnz
# predicate below).  Non-EVP builds additionally read/write the ivec
# with alignment handling.  NOTE(review): this listing is elided —
# interior here-doc lines are missing from view.
30 sub alg_cbc_encrypt_implement {
34 .globl ${alg}${bits}_t4_cbc_encrypt
36 ${alg}${bits}_t4_cbc_encrypt:
37 save %sp, -$::frame, %sp
38 sub $inp, $out, $blk_init ! $inp!=$out
40 $::code.=<<___ if (!$::evp);
41 andcc $ivec, 7, $ivoff
42 alignaddr $ivec, %g0, $ivec
44 ldd [$ivec + 0], %f0 ! load ivec
48 faligndata %f0, %f2, %f0
49 faligndata %f2, %f4, %f2
52 $::code.=<<___ if ($::evp);
60 prefetch [$inp + 63], 20
61 call _${alg}${bits}_load_enckey
67 sub $iright, $ileft, $iright
70 movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
71 movleu $::size_t_cc, 0, $blk_init ! $len<128 ||
72 brnz,pn $blk_init, .L${bits}cbc_enc_blk ! $inp==$out)
73 srl $omask, $ooff, $omask
75 alignaddrl $out, %g0, $out
79 .L${bits}_cbc_enc_loop:
86 srlx %o1, $iright, %g1
89 srlx %o2, $iright, %o2
92 xor %g4, %o0, %o0 ! ^= rk[0]
97 fxor %f12, %f0, %f0 ! ^= ivec
99 prefetch [$out + 63], 22
100 prefetch [$inp + 16+63], 20
101 call _${alg}${bits}_encrypt_1x
109 brnz,pt $len, .L${bits}_cbc_enc_loop
112 $::code.=<<___ if ($::evp);
118 $::code.=<<___ if (!$::evp);
122 std %f0, [$ivec + 0] ! write out ivec
130 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
131 ! and ~3x deterioration
133 faligndata %f0, %f0, %f4 ! handle unaligned output
134 faligndata %f0, %f2, %f6
135 faligndata %f2, %f2, %f8
137 stda %f4, [$out + $omask]0xc0 ! partial store
140 orn %g0, $omask, $omask
141 stda %f8, [$out + $omask]0xc0 ! partial store
143 brnz,pt $len, .L${bits}_cbc_enc_loop+4
144 orn %g0, $omask, $omask
146 $::code.=<<___ if ($::evp);
152 $::code.=<<___ if (!$::evp);
156 std %f0, [$ivec + 0] ! write out ivec
162 3: alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
164 srl $omask, $ivoff, $omask
165 faligndata %f0, %f0, %f4
166 faligndata %f0, %f2, %f6
167 faligndata %f2, %f2, %f8
168 stda %f4, [$ivec + $omask]0xc0
171 orn %g0, $omask, $omask
172 stda %f8, [$ivec + $omask]0xc0
178 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
180 .L${bits}cbc_enc_blk:
181 add $out, $len, $blk_init
182 and $blk_init, 63, $blk_init ! tail
183 sub $len, $blk_init, $len
184 add $blk_init, 15, $blk_init ! round up to 16n
186 srl $blk_init, 4, $blk_init
188 .L${bits}_cbc_enc_blk_loop:
194 sllx %o0, $ileft, %o0
195 srlx %o1, $iright, %g1
196 sllx %o1, $ileft, %o1
198 srlx %o2, $iright, %o2
201 xor %g4, %o0, %o0 ! ^= rk[0]
206 fxor %f12, %f0, %f0 ! ^= ivec
208 prefetch [$inp + 16+63], 20
209 call _${alg}${bits}_encrypt_1x
213 stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
215 stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
216 brnz,pt $len, .L${bits}_cbc_enc_blk_loop
219 membar #StoreLoad|#StoreStore
220 brnz,pt $blk_init, .L${bits}_cbc_enc_loop
223 $::code.=<<___ if ($::evp);
229 $::code.=<<___ if (!$::evp);
233 std %f0, [$ivec + 0] ! write out ivec
239 .type ${alg}${bits}_t4_cbc_encrypt,#function
240 .size ${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
# Append to $::code a T4 implementation of ${alg}${bits}_t4_cbc_decrypt.
# Unlike the encrypt side, decryption is parallelizable, so this emits
# both a 1x loop and a 2x-interleaved loop (_decrypt_2x); an
# "andcc $len, 16" dispatch picks the loop parity.  Also emits
# unaligned-output/ivec fixups (faligndata + partial stores, ASI 0xc0)
# and a bulk path (.L*cbc_dec_blk) using block-initializing stores
# (ASI 0xe2, T4-specific) for aligned, large, non-aliasing buffers.
# NOTE(review): this listing is elided — interior here-doc lines are
# missing from view.
244 sub alg_cbc_decrypt_implement {
245 my ($alg,$bits) = @_;
248 .globl ${alg}${bits}_t4_cbc_decrypt
250 ${alg}${bits}_t4_cbc_decrypt:
251 save %sp, -$::frame, %sp
252 sub $inp, $out, $blk_init ! $inp!=$out
254 $::code.=<<___ if (!$::evp);
255 andcc $ivec, 7, $ivoff
256 alignaddr $ivec, %g0, $ivec
258 ldd [$ivec + 0], %f12 ! load ivec
260 ldd [$ivec + 8], %f14
261 ldd [$ivec + 16], %f0
262 faligndata %f12, %f14, %f12
263 faligndata %f14, %f0, %f14
266 $::code.=<<___ if ($::evp);
267 ld [$ivec + 0], %f12 ! load ivec
270 ld [$ivec + 12], %f15
274 prefetch [$inp + 63], 20
275 call _${alg}${bits}_load_deckey
278 sll $ileft, 3, $ileft
281 sub $iright, $ileft, $iright
284 movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
285 movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
286 brnz,pn $blk_init, .L${bits}cbc_dec_blk ! $inp==$out)
287 srl $omask, $ooff, $omask
289 andcc $len, 16, %g0 ! is number of blocks even?
291 alignaddrl $out, %g0, $out
292 bz %icc, .L${bits}_cbc_dec_loop2x
294 .L${bits}_cbc_dec_loop:
300 sllx %o0, $ileft, %o0
301 srlx %o1, $iright, %g1
302 sllx %o1, $ileft, %o1
304 srlx %o2, $iright, %o2
307 xor %g4, %o0, %o2 ! ^= rk[0]
312 prefetch [$out + 63], 22
313 prefetch [$inp + 16+63], 20
314 call _${alg}${bits}_decrypt_1x
317 fxor %f12, %f0, %f0 ! ^= ivec
327 brnz,pt $len, .L${bits}_cbc_dec_loop2x
330 $::code.=<<___ if ($::evp);
334 st %f15, [$ivec + 12]
336 $::code.=<<___ if (!$::evp);
337 brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
340 std %f12, [$ivec + 0] ! write out ivec
341 std %f14, [$ivec + 8]
348 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
349 ! and ~3x deterioration
351 faligndata %f0, %f0, %f4 ! handle unaligned output
352 faligndata %f0, %f2, %f6
353 faligndata %f2, %f2, %f8
355 stda %f4, [$out + $omask]0xc0 ! partial store
358 orn %g0, $omask, $omask
359 stda %f8, [$out + $omask]0xc0 ! partial store
361 brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
362 orn %g0, $omask, $omask
364 $::code.=<<___ if ($::evp);
368 st %f15, [$ivec + 12]
370 $::code.=<<___ if (!$::evp);
371 brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
374 std %f12, [$ivec + 0] ! write out ivec
375 std %f14, [$ivec + 8]
381 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
383 .L${bits}_cbc_dec_loop2x:
391 sllx %o0, $ileft, %o0
392 srlx %o1, $iright, %g1
394 sllx %o1, $ileft, %o1
395 srlx %o2, $iright, %g1
397 sllx %o2, $ileft, %o2
398 srlx %o3, $iright, %g1
400 sllx %o3, $ileft, %o3
401 srlx %o4, $iright, %o4
404 xor %g4, %o0, %o4 ! ^= rk[0]
413 prefetch [$out + 63], 22
414 prefetch [$inp + 32+63], 20
415 call _${alg}${bits}_decrypt_2x
420 fxor %f12, %f0, %f0 ! ^= ivec
434 brnz,pt $len, .L${bits}_cbc_dec_loop2x
437 $::code.=<<___ if ($::evp);
441 st %f15, [$ivec + 12]
443 $::code.=<<___ if (!$::evp);
444 brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
447 std %f12, [$ivec + 0] ! write out ivec
448 std %f14, [$ivec + 8]
455 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
456 ! and ~3x deterioration
458 faligndata %f0, %f0, %f8 ! handle unaligned output
459 faligndata %f0, %f2, %f0
460 faligndata %f2, %f4, %f2
461 faligndata %f4, %f6, %f4
462 faligndata %f6, %f6, %f6
463 stda %f8, [$out + $omask]0xc0 ! partial store
468 orn %g0, $omask, $omask
469 stda %f6, [$out + $omask]0xc0 ! partial store
471 brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
472 orn %g0, $omask, $omask
474 $::code.=<<___ if ($::evp);
478 st %f15, [$ivec + 12]
480 $::code.=<<___ if (!$::evp);
481 brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
484 std %f12, [$ivec + 0] ! write out ivec
485 std %f14, [$ivec + 8]
490 .L${bits}_cbc_dec_unaligned_ivec:
491 alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
493 srl $omask, $ivoff, $omask
494 faligndata %f12, %f12, %f0
495 faligndata %f12, %f14, %f2
496 faligndata %f14, %f14, %f4
497 stda %f0, [$ivec + $omask]0xc0
500 orn %g0, $omask, $omask
501 stda %f4, [$ivec + $omask]0xc0
507 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
509 .L${bits}cbc_dec_blk:
510 add $out, $len, $blk_init
511 and $blk_init, 63, $blk_init ! tail
512 sub $len, $blk_init, $len
513 add $blk_init, 15, $blk_init ! round up to 16n
515 srl $blk_init, 4, $blk_init
517 add $blk_init, 1, $blk_init
519 .L${bits}_cbc_dec_blk_loop2x:
527 sllx %o0, $ileft, %o0
528 srlx %o1, $iright, %g1
530 sllx %o1, $ileft, %o1
531 srlx %o2, $iright, %g1
533 sllx %o2, $ileft, %o2
534 srlx %o3, $iright, %g1
536 sllx %o3, $ileft, %o3
537 srlx %o4, $iright, %o4
540 xor %g4, %o0, %o4 ! ^= rk[0]
549 prefetch [$inp + 32+63], 20
550 call _${alg}${bits}_decrypt_2x
556 fxor %f12, %f0, %f0 ! ^= ivec
563 stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
565 stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
567 stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
569 stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
570 bgu,pt $::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
573 add $blk_init, $len, $len
574 andcc $len, 1, %g0 ! is number of blocks even?
575 membar #StoreLoad|#StoreStore
576 bnz,pt %icc, .L${bits}_cbc_dec_loop
578 brnz,pn $len, .L${bits}_cbc_dec_loop2x
581 $::code.=<<___ if ($::evp);
582 st %f12, [$ivec + 0] ! write out ivec
585 st %f15, [$ivec + 12]
587 $::code.=<<___ if (!$::evp);
591 std %f12, [$ivec + 0] ! write out ivec
592 std %f14, [$ivec + 8]
597 .type ${alg}${bits}_t4_cbc_decrypt,#function
598 .size ${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
# Append to $::code a T4 implementation of ${alg}${bits}_t4_ctr32_encrypt.
# Counter mode is parallelizable, so 1x and 2x-interleaved loops are
# emitted, with the 32-bit counter word kept in %l7 ("clruw" keeps it
# 32-bit).  The first cipher round per block is issued inline with
# algorithm-specific instructions (aes_eround*/camellia_f) before
# tail-calling _encrypt_1x+8 / _encrypt_2x+16 past their first round.
# Same unaligned-output fixups (ASI 0xc0) and bulk ASI 0xe2 path as the
# CBC generators.  NOTE(review): this listing is elided — interior
# here-doc lines are missing from view.
602 sub alg_ctr32_implement {
603 my ($alg,$bits) = @_;
606 .globl ${alg}${bits}_t4_ctr32_encrypt
608 ${alg}${bits}_t4_ctr32_encrypt:
609 save %sp, -$::frame, %sp
612 prefetch [$inp + 63], 20
613 call _${alg}${bits}_load_enckey
616 ld [$ivec + 0], %l4 ! counter
624 xor %o5, %g4, %g4 ! ^= rk[0]
626 movxtod %g4, %f14 ! most significant 64 bits
628 sub $inp, $out, $blk_init ! $inp!=$out
631 sll $ileft, 3, $ileft
634 sub $iright, $ileft, $iright
637 movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
638 movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
639 brnz,pn $blk_init, .L${bits}_ctr32_blk ! $inp==$out)
640 srl $omask, $ooff, $omask
642 andcc $len, 16, %g0 ! is number of blocks even?
643 alignaddrl $out, %g0, $out
644 bz %icc, .L${bits}_ctr32_loop2x
646 .L${bits}_ctr32_loop:
652 sllx %o0, $ileft, %o0
653 srlx %o1, $iright, %g1
654 sllx %o1, $ileft, %o1
656 srlx %o2, $iright, %o2
659 xor %g5, %l7, %g1 ! ^= rk[0]
662 srl %l7, 0, %l7 ! clruw
663 prefetch [$out + 63], 22
664 prefetch [$inp + 16+63], 20
666 $::code.=<<___ if ($alg eq "aes");
667 aes_eround01 %f16, %f14, %f2, %f4
668 aes_eround23 %f18, %f14, %f2, %f2
670 $::code.=<<___ if ($alg eq "cmll");
671 camellia_f %f16, %f2, %f14, %f2
672 camellia_f %f18, %f14, %f2, %f0
675 call _${alg}${bits}_encrypt_1x+8
680 fxor %f10, %f0, %f0 ! ^= inp
688 brnz,pt $len, .L${bits}_ctr32_loop2x
695 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
696 ! and ~3x deterioration
698 faligndata %f0, %f0, %f4 ! handle unaligned output
699 faligndata %f0, %f2, %f6
700 faligndata %f2, %f2, %f8
701 stda %f4, [$out + $omask]0xc0 ! partial store
704 orn %g0, $omask, $omask
705 stda %f8, [$out + $omask]0xc0 ! partial store
707 brnz,pt $len, .L${bits}_ctr32_loop2x+4
708 orn %g0, $omask, $omask
713 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
715 .L${bits}_ctr32_loop2x:
723 sllx %o0, $ileft, %o0
724 srlx %o1, $iright, %g1
726 sllx %o1, $ileft, %o1
727 srlx %o2, $iright, %g1
729 sllx %o2, $ileft, %o2
730 srlx %o3, $iright, %g1
732 sllx %o3, $ileft, %o3
733 srlx %o4, $iright, %o4
736 xor %g5, %l7, %g1 ! ^= rk[0]
739 srl %l7, 0, %l7 ! clruw
743 srl %l7, 0, %l7 ! clruw
744 prefetch [$out + 63], 22
745 prefetch [$inp + 32+63], 20
747 $::code.=<<___ if ($alg eq "aes");
748 aes_eround01 %f16, %f14, %f2, %f8
749 aes_eround23 %f18, %f14, %f2, %f2
750 aes_eround01 %f16, %f14, %f6, %f10
751 aes_eround23 %f18, %f14, %f6, %f6
753 $::code.=<<___ if ($alg eq "cmll");
754 camellia_f %f16, %f2, %f14, %f2
755 camellia_f %f16, %f6, %f14, %f6
756 camellia_f %f18, %f14, %f2, %f0
757 camellia_f %f18, %f14, %f6, %f4
760 call _${alg}${bits}_encrypt_2x+16
766 fxor %f8, %f0, %f0 ! ^= inp
779 brnz,pt $len, .L${bits}_ctr32_loop2x
786 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
787 ! and ~3x deterioration
789 faligndata %f0, %f0, %f8 ! handle unaligned output
790 faligndata %f0, %f2, %f0
791 faligndata %f2, %f4, %f2
792 faligndata %f4, %f6, %f4
793 faligndata %f6, %f6, %f6
795 stda %f8, [$out + $omask]0xc0 ! partial store
800 orn %g0, $omask, $omask
801 stda %f6, [$out + $omask]0xc0 ! partial store
803 brnz,pt $len, .L${bits}_ctr32_loop2x+4
804 orn %g0, $omask, $omask
809 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
812 add $out, $len, $blk_init
813 and $blk_init, 63, $blk_init ! tail
814 sub $len, $blk_init, $len
815 add $blk_init, 15, $blk_init ! round up to 16n
817 srl $blk_init, 4, $blk_init
819 add $blk_init, 1, $blk_init
821 .L${bits}_ctr32_blk_loop2x:
829 sllx %o0, $ileft, %o0
830 srlx %o1, $iright, %g1
832 sllx %o1, $ileft, %o1
833 srlx %o2, $iright, %g1
835 sllx %o2, $ileft, %o2
836 srlx %o3, $iright, %g1
838 sllx %o3, $ileft, %o3
839 srlx %o4, $iright, %o4
842 xor %g5, %l7, %g1 ! ^= rk[0]
845 srl %l7, 0, %l7 ! clruw
849 srl %l7, 0, %l7 ! clruw
850 prefetch [$inp + 32+63], 20
852 $::code.=<<___ if ($alg eq "aes");
853 aes_eround01 %f16, %f14, %f2, %f8
854 aes_eround23 %f18, %f14, %f2, %f2
855 aes_eround01 %f16, %f14, %f6, %f10
856 aes_eround23 %f18, %f14, %f6, %f6
858 $::code.=<<___ if ($alg eq "cmll");
859 camellia_f %f16, %f2, %f14, %f2
860 camellia_f %f16, %f6, %f14, %f6
861 camellia_f %f18, %f14, %f2, %f0
862 camellia_f %f18, %f14, %f6, %f4
865 call _${alg}${bits}_encrypt_2x+16
872 fxor %f8, %f0, %f0 ! ^= inp
878 stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
880 stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
882 stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
884 stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
885 bgu,pt $::size_t_cc, .L${bits}_ctr32_blk_loop2x
888 add $blk_init, $len, $len
889 andcc $len, 1, %g0 ! is number of blocks even?
890 membar #StoreLoad|#StoreStore
891 bnz,pt %icc, .L${bits}_ctr32_loop
893 brnz,pn $len, .L${bits}_ctr32_loop2x
898 .type ${alg}${bits}_t4_ctr32_encrypt,#function
899 .size ${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
903 # The purpose of these subroutines is to explicitly encode VIS instructions,
904 # so that one can compile the module without having to specify VIS
905 # extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
906 # The idea is to preserve the option to produce a "universal" binary and
907 # let the programmer detect at run-time whether the current CPU is VIS-capable.
909 my ($mnemonic,$rs1,$rs2,$rd)=@_;
911 my %visopf = ( "faligndata" => 0x048,
916 $ref = "$mnemonic\t$rs1,$rs2,$rd";
918 if ($opf=$visopf{$mnemonic}) {
919 foreach ($rs1,$rs2,$rd) {
920 return $ref if (!/%f([0-9]{1,2})/);
923 return $ref if ($1&1);
924 # re-encode for upper double register addressing
929 return sprintf ".word\t0x%08x !%s",
930 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
937 my ($mnemonic,$rs1,$rs2,$rd)=@_;
938 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
940 my %visopf = ( "addxc" => 0x011,
942 "umulxhi" => 0x016 );
944 $ref = "$mnemonic\t$rs1,$rs2,$rd";
946 if ($opf=$visopf{$mnemonic}) {
947 foreach ($rs1,$rs2,$rd) {
948 return $ref if (!/%([goli])([0-9])/);
952 return sprintf ".word\t0x%08x !%s",
953 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
960 my ($mnemonic,$rs1,$rs2,$rd)=@_;
961 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
962 my $ref = "$mnemonic\t$rs1,$rs2,$rd";
963 my $opf = $mnemonic =~ /l$/ ? 0x01a :0x18;
965 foreach ($rs1,$rs2,$rd) {
966 if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
967 else { return $ref; }
969 return sprintf ".word\t0x%08x !%s",
970 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
# Hand-encode a 4-operand T4 AES round/key-expansion instruction as a
# raw ".word" so the module assembles without a crypto-capable
# assembler.  Returns the textual instruction ($ref) unchanged when the
# mnemonic is unknown or an operand cannot be encoded.
974 sub unaes_round { # 4-argument instructions
975 my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
# Per-mnemonic opf field values (listing elided in this view).
977 my %aesopf = ( "aes_eround01" => 0,
981 "aes_eround01_l"=> 4,
982 "aes_eround23_l"=> 5,
983 "aes_dround01_l"=> 6,
984 "aes_dround23_l"=> 7,
985 "aes_kexpand1" => 8 );
987 $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
989 if (defined($opf=$aesopf{$mnemonic})) {
# Fold an even %f32..%f62 register number into the 5-bit field by
# or-ing bit 5 into bit 0.
990 $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
991 foreach ($rs1,$rs2,$rd) {
992 return $ref if (!/%f([0-9]{1,2})/);
# Odd register numbers are invalid double-precision operands here.
995 return $ref if ($1&1);
996 # re-encode for upper double register addressing
# Emit as raw data word; the original mnemonic stays as an asm comment.
1001 return sprintf ".word\t0x%08x !%s",
1002 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
# Hand-encode a 3-operand T4 AES key-expansion instruction
# (aes_kexpand0/aes_kexpand2) as a raw ".word"; falls back to the
# textual form when the mnemonic or an operand cannot be encoded.
1009 sub unaes_kexpand { # 3-argument instructions
1010 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1012 my %aesopf = ( "aes_kexpand0" => 0x130,
1013 "aes_kexpand2" => 0x131 );
1015 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1017 if (defined($opf=$aesopf{$mnemonic})) {
1018 foreach ($rs1,$rs2,$rd) {
1019 return $ref if (!/%f([0-9]{1,2})/);
# Odd register numbers are invalid double-precision operands here.
1022 return $ref if ($1&1);
1023 # re-encode for upper double register addressing
1028 return sprintf ".word\t0x%08x !%s",
1029 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
# Hand-encode the 4-operand T4 camellia_f instruction as a raw ".word"
# (fixed opf 0xc); returns the textual form when an operand cannot be
# encoded.
1036 sub uncamellia_f { # 4-argument instructions
1037 my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
1040 $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
# Fold an even %f32..%f62 register number into the 5-bit field by
# or-ing bit 5 into bit 0.
1043 $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
1044 foreach ($rs1,$rs2,$rd) {
1045 return $ref if (!/%f([0-9]{1,2})/);
# Odd register numbers are invalid double-precision operands here.
1048 return $ref if ($1&1);
1049 # re-encode for upper double register addressing
1054 return sprintf ".word\t0x%08x !%s",
1055 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
# Hand-encode 3-operand T4 Camellia instructions (camellia_fl,
# camellia_fli) as raw ".word" values; falls back to the textual form
# when the mnemonic or an operand cannot be encoded.
1062 sub uncamellia3 { # 3-argument instructions
1063 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1065 my %cmllopf = ( "camellia_fl" => 0x13c,
1066 "camellia_fli" => 0x13d );
1068 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1070 if (defined($opf=$cmllopf{$mnemonic})) {
1071 foreach ($rs1,$rs2,$rd) {
1072 return $ref if (!/%f([0-9]{1,2})/);
# Odd register numbers are invalid double-precision operands here.
1075 return $ref if ($1&1);
1076 # re-encode for upper double register addressing
1081 return sprintf ".word\t0x%08x !%s",
1082 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
# Hand-encode 2-operand T4 register-file move instructions
# (movdtox/movstouw/movstosw/.../movwtos) as raw ".word" values.  The
# %bias table maps integer register banks (g/o/l/i) to their absolute
# encoding offsets; %f registers need no bias.  Falls back to the
# textual form when the mnemonic or an operand cannot be encoded.
1089 sub unmovxtox { # 2-argument instructions
1090 my ($mnemonic,$rs,$rd)=@_;
1091 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
1093 my %movxopf = ( "movdtox" => 0x110,
1094 "movstouw" => 0x111,
1095 "movstosw" => 0x113,
1097 "movwtos" => 0x119 );
1099 $ref = "$mnemonic\t$rs,$rd";
1101 if (defined($opf=$movxopf{$mnemonic})) {
1103 return $ref if (!/%([fgoli])([0-9]{1,2})/);
# Odd register numbers are invalid double-precision operands here.
1106 return $ref if ($2&1);
1107 # re-encode for upper double register addressing
1112 return sprintf ".word\t0x%08x !%s",
1113 2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
1120 sub emit_assembler {
1121 foreach (split("\n",$::code)) {
1122 s/\`([^\`]*)\`/eval $1/ge;
1124 s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/g;
1126 s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
1127 &unaes_round($1,$2,$3,$4,$5)
1129 s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1130 &unaes_kexpand($1,$2,$3,$4)
1132 s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
1133 &uncamellia_f($1,$2,$3,$4,$5)
1135 s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1136 &uncamellia3($1,$2,$3,$4)
1138 s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
1139 &unmovxtox($1,$2,$3)
1141 s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
1142 &unmovxtox($1,$2,$3)
1144 s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1147 s/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
1148 &unalignaddr($1,$2,$3,$4)
1150 s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
1151 &unvis3($1,$2,$3,$4)