From 947716c1872d210828122212d076d503ae68b928 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Thu, 1 Sep 2016 10:39:15 +0200 Subject: [PATCH] MIPS assembly pack: adapt it for MIPS[32|64]R6. MIPS[32|64]R6 is binary and source incompatible with previous MIPS ISA specifications. Fortunately it's still possible to resolve differences in source code with standard pre-processor and switching to trap-free version of addition and subtraction instructions. Reviewed-by: Richard Levitte --- Configurations/00-base-templates.conf | 2 +- crypto/aes/asm/aes-mips.pl | 87 ++- crypto/aes/build.info | 1 + crypto/bn/asm/mips-mont.pl | 102 ++-- crypto/bn/asm/mips.pl | 742 +++++++++++++------------- crypto/bn/build.info | 16 +- crypto/mips_arch.h | 40 ++ crypto/poly1305/asm/poly1305-mips.pl | 50 +- crypto/poly1305/build.info | 1 + crypto/sha/asm/sha1-mips.pl | 26 +- crypto/sha/asm/sha512-mips.pl | 22 +- crypto/sha/build.info | 3 + 12 files changed, 609 insertions(+), 483 deletions(-) create mode 100644 crypto/mips_arch.h diff --git a/Configurations/00-base-templates.conf b/Configurations/00-base-templates.conf index 1c63a50136..8bb4de75f9 100644 --- a/Configurations/00-base-templates.conf +++ b/Configurations/00-base-templates.conf @@ -216,7 +216,7 @@ }, mips32_asm => { template => 1, - bn_asm_src => "bn-mips.s mips-mont.s", + bn_asm_src => "bn-mips.S mips-mont.S", aes_asm_src => "aes_cbc.c aes-mips.S", sha1_asm_src => "sha1-mips.S sha256-mips.S", }, diff --git a/crypto/aes/asm/aes-mips.pl b/crypto/aes/asm/aes-mips.pl index 439578d9c2..0eb1474a3a 100644 --- a/crypto/aes/asm/aes-mips.pl +++ b/crypto/aes/asm/aes-mips.pl @@ -65,8 +65,8 @@ $flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 if ($flavour =~ /64|n32/i) { $PTR_LA="dla"; - $PTR_ADD="dadd"; # incidentally works even on n32 - $PTR_SUB="dsub"; # incidentally works even on n32 + $PTR_ADD="daddu"; # incidentally works even on n32 + $PTR_SUB="dsubu"; # incidentally works even on n32 $PTR_INS="dins"; $REG_S="sd"; $REG_L="ld"; @@ -74,8 +74,8 @@ if ($flavour =~ /64|n32/i) { $SZREG=8; } else { $PTR_LA="la"; - $PTR_ADD="add"; - $PTR_SUB="sub"; + $PTR_ADD="addu"; + $PTR_SUB="subu"; $PTR_INS="ins"; $REG_S="sw"; $REG_L="lw"; @@ -102,15 +102,13 @@ open STDOUT,">$output"; my ($MSB,$LSB)=(0,3); # automatically converted to little-endian $code.=<<___; +#include "mips_arch.h" + .text #ifdef OPENSSL_FIPSCANISTER # include #endif -#if defined(__mips_smartmips) && !defined(_MIPS_ARCH_MIPS32R2) -#define _MIPS_ARCH_MIPS32R2 -#endif - #if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__)) .option pic2 #endif @@ -146,7 +144,7 @@ _mips_AES_encrypt: xor $s2,$t2 xor $s3,$t3 - sub $cnt,1 + subu $cnt,1 #if defined(__mips_smartmips) ext $i0,$s1,16,8 .Loop_enc: @@ -218,7 +216,7 @@ _mips_AES_encrypt: xor $t2,$t6 xor $t3,$t7 - sub $cnt,1 + subu $cnt,1 $PTR_ADD $key0,16 xor $s0,$t0 xor $s1,$t1 @@ -409,7 +407,7 @@ _mips_AES_encrypt: xor $t2,$t6 xor $t3,$t7 - sub $cnt,1 + subu $cnt,1 $PTR_ADD $key0,16 xor $s0,$t0 xor $s1,$t1 @@ -657,6 +655,12 @@ $code.=<<___; .set reorder $PTR_LA $Tbl,AES_Te # PIC-ified 'load address' +#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6) + lw $s0,0($inp) + lw $s1,4($inp) + lw $s2,8($inp) + lw $s3,12($inp) +#else lwl $s0,0+$MSB($inp) lwl $s1,4+$MSB($inp) lwl $s2,8+$MSB($inp) @@ -665,9 +669,16 @@ $code.=<<___; lwr $s1,4+$LSB($inp) lwr $s2,8+$LSB($inp) lwr $s3,12+$LSB($inp) +#endif bal _mips_AES_encrypt +#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6) + sw $s0,0($out) + sw $s1,4($out) + sw $s2,8($out) + sw $s3,12($out) +#else swr $s0,0+$LSB($out) swr $s1,4+$LSB($out) swr $s2,8+$LSB($out) @@ -676,6 +687,7 @@ $code.=<<___; swl $s1,4+$MSB($out) swl $s2,8+$MSB($out) swl $s3,12+$MSB($out) +#endif .set noreorder $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) @@ -720,7 +732,7 @@ _mips_AES_decrypt: xor $s2,$t2 xor $s3,$t3 - sub $cnt,1 + subu $cnt,1 #if defined(__mips_smartmips) ext $i0,$s3,16,8 .Loop_dec: @@ -792,7 +804,7 @@ _mips_AES_decrypt: xor $t2,$t6 xor $t3,$t7 - sub $cnt,1 + subu $cnt,1 $PTR_ADD $key0,16 xor $s0,$t0 xor $s1,$t1 @@ -985,7 +997,7 @@ _mips_AES_decrypt: xor $t2,$t6 xor $t3,$t7 - sub $cnt,1 + subu $cnt,1 $PTR_ADD $key0,16 xor $s0,$t0 xor $s1,$t1 @@ -1228,6 +1240,12 @@ $code.=<<___; .set reorder $PTR_LA $Tbl,AES_Td # PIC-ified 'load address' +#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6) + lw $s0,0($inp) + lw $s1,4($inp) + lw $s2,8($inp) + lw $s3,12($inp) +#else lwl $s0,0+$MSB($inp) lwl $s1,4+$MSB($inp) lwl $s2,8+$MSB($inp) @@ -1236,9 +1254,16 @@ $code.=<<___; lwr $s1,4+$LSB($inp) lwr $s2,8+$LSB($inp) lwr $s3,12+$LSB($inp) +#endif bal _mips_AES_decrypt +#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6) + sw $s0,0($out) + sw $s1,4($out) + sw $s2,8($out) + sw $s3,12($out) +#else swr $s0,0+$LSB($out) swr $s1,4+$LSB($out) swr $s2,8+$LSB($out) @@ -1247,6 +1272,7 @@ $code.=<<___; swl $s1,4+$MSB($out) swl $s2,8+$MSB($out) swl $s3,12+$MSB($out) +#endif .set noreorder $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) @@ -1295,35 +1321,52 @@ _mips_AES_set_encrypt_key: $PTR_ADD $rcon,$Tbl,256 .set reorder +#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6) + lw $rk0,0($inp) # load 128 bits + lw $rk1,4($inp) + lw $rk2,8($inp) + lw $rk3,12($inp) +#else lwl $rk0,0+$MSB($inp) # load 128 bits lwl $rk1,4+$MSB($inp) lwl $rk2,8+$MSB($inp) lwl $rk3,12+$MSB($inp) - li $at,128 lwr $rk0,0+$LSB($inp) lwr $rk1,4+$LSB($inp) lwr $rk2,8+$LSB($inp) lwr $rk3,12+$LSB($inp) +#endif + li $at,128 .set noreorder beq $bits,$at,.L128bits li $cnt,10 .set reorder +#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6) + lw $rk4,16($inp) # load 192 bits + lw $rk5,20($inp) +#else lwl $rk4,16+$MSB($inp) # load 192 bits lwl $rk5,20+$MSB($inp) - li $at,192 lwr $rk4,16+$LSB($inp) lwr $rk5,20+$LSB($inp) +#endif + li $at,192 .set noreorder beq $bits,$at,.L192bits li $cnt,8 .set reorder +#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6) + lw $rk6,24($inp) # load 256 bits + lw $rk7,28($inp) +#else lwl $rk6,24+$MSB($inp) # load 256 bits lwl $rk7,28+$MSB($inp) - li $at,256 lwr $rk6,24+$LSB($inp) lwr $rk7,28+$LSB($inp) +#endif + li $at,256 .set noreorder beq $bits,$at,.L256bits li $cnt,7 @@ -1353,7 +1396,7 @@ _mips_AES_set_encrypt_key: sw $rk1,4($key) sw $rk2,8($key) sw $rk3,12($key) - sub $cnt,1 + subu $cnt,1 $PTR_ADD $key,16 _bias $i0,24 @@ -1410,7 +1453,7 @@ _mips_AES_set_encrypt_key: sw $rk3,12($key) sw $rk4,16($key) sw $rk5,20($key) - sub $cnt,1 + subu $cnt,1 $PTR_ADD $key,24 _bias $i0,24 @@ -1471,7 +1514,7 @@ _mips_AES_set_encrypt_key: sw $rk5,20($key) sw $rk6,24($key) sw $rk7,28($key) - sub $cnt,1 + subu $cnt,1 _bias $i0,24 _bias $i1,16 @@ -1653,7 +1696,7 @@ $code.=<<___; lw $tp1,16($key) # modulo-scheduled lui $x80808080,0x8080 - sub $cnt,1 + subu $cnt,1 or $x80808080,0x8080 sll $cnt,2 $PTR_ADD $key,16 @@ -1716,7 +1759,7 @@ $code.=<<___; lw $tp1,4($key) # modulo-scheduled xor $tpe,$tp2 #endif - sub $cnt,1 + subu $cnt,1 sw $tpe,0($key) $PTR_ADD $key,4 bnez $cnt,.Lmix diff --git a/crypto/aes/build.info b/crypto/aes/build.info index cf6cb5ec25..bcc71abaeb 100644 --- a/crypto/aes/build.info +++ b/crypto/aes/build.info @@ -35,6 +35,7 @@ GENERATE[aesp8-ppc.s]=asm/aesp8-ppc.pl $(PERLASM_SCHEME) GENERATE[aes-parisc.s]=asm/aes-parisc.pl $(PERLASM_SCHEME) GENERATE[aes-mips.S]=asm/aes-mips.pl $(PERLASM_SCHEME) +INCLUDE[aes-mips.o]=.. GENERATE[aesv8-armx.S]=asm/aesv8-armx.pl $(PERLASM_SCHEME) INCLUDE[aesv8-armx.o]=.. diff --git a/crypto/bn/asm/mips-mont.pl b/crypto/bn/asm/mips-mont.pl index a907571bec..56d4202824 100644 --- a/crypto/bn/asm/mips-mont.pl +++ b/crypto/bn/asm/mips-mont.pl @@ -56,14 +56,14 @@ $flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 if ($flavour =~ /64|n32/i) { - $PTR_ADD="dadd"; # incidentally works even on n32 - $PTR_SUB="dsub"; # incidentally works even on n32 + $PTR_ADD="daddu"; # incidentally works even on n32 + $PTR_SUB="dsubu"; # incidentally works even on n32 $REG_S="sd"; $REG_L="ld"; $SZREG=8; } else { - $PTR_ADD="add"; - $PTR_SUB="sub"; + $PTR_ADD="addu"; + $PTR_SUB="subu"; $REG_S="sw"; $REG_L="lw"; $SZREG=4; @@ -121,6 +121,8 @@ $m1=$s11; $FRAMESIZE=14; $code=<<___; +#include "mips_arch.h" + .text .set noat @@ -183,27 +185,27 @@ $code.=<<___; $PTR_SUB $sp,$num and $sp,$at - $MULTU $aj,$bi - $LD $alo,$BNSZ($ap) - $LD $nlo,$BNSZ($np) - mflo $lo0 - mfhi $hi0 - $MULTU $lo0,$n0 - mflo $m1 - - $MULTU $alo,$bi - mflo $alo - mfhi $ahi - - $MULTU $nj,$m1 - mflo $lo1 - mfhi $hi1 - $MULTU $nlo,$m1 + $MULTU ($aj,$bi) + $LD $ahi,$BNSZ($ap) + $LD $nhi,$BNSZ($np) + mflo ($lo0,$aj,$bi) + mfhi ($hi0,$aj,$bi) + $MULTU ($lo0,$n0) + mflo ($m1,$lo0,$n0) + + $MULTU ($ahi,$bi) + mflo ($alo,$ahi,$bi) + mfhi ($ahi,$ahi,$bi) + + $MULTU ($nj,$m1) + mflo ($lo1,$nj,$m1) + mfhi ($hi1,$nj,$m1) + $MULTU ($nhi,$m1) $ADDU $lo1,$lo0 sltu $at,$lo1,$lo0 $ADDU $hi1,$at - mflo $nlo - mfhi $nhi + mflo ($nlo,$nhi,$m1) + mfhi ($nhi,$nhi,$m1) move $tp,$sp li $j,2*$BNSZ @@ -215,25 +217,25 @@ $code.=<<___; $LD $aj,($aj) $LD $nj,($nj) - $MULTU $aj,$bi + $MULTU ($aj,$bi) $ADDU $lo0,$alo,$hi0 $ADDU $lo1,$nlo,$hi1 sltu $at,$lo0,$hi0 sltu $t0,$lo1,$hi1 $ADDU $hi0,$ahi,$at $ADDU $hi1,$nhi,$t0 - mflo $alo - mfhi $ahi + mflo ($alo,$aj,$bi) + mfhi ($ahi,$aj,$bi) $ADDU $lo1,$lo0 sltu $at,$lo1,$lo0 - $MULTU $nj,$m1 + $MULTU ($nj,$m1) $ADDU $hi1,$at addu $j,$BNSZ $ST $lo1,($tp) sltu $t0,$j,$num - mflo $nlo - mfhi $nhi + mflo ($nlo,$nj,$m1) + mfhi ($nhi,$nj,$m1) bnez $t0,.L1st $PTR_ADD $tp,$BNSZ @@ -263,34 +265,34 @@ $code.=<<___; $PTR_ADD $bi,$bp,$i $LD $bi,($bi) $LD $aj,($ap) - $LD $alo,$BNSZ($ap) + $LD $ahi,$BNSZ($ap) $LD $tj,($sp) - $MULTU $aj,$bi + $MULTU ($aj,$bi) $LD $nj,($np) - $LD $nlo,$BNSZ($np) - mflo $lo0 - mfhi $hi0 + $LD $nhi,$BNSZ($np) + mflo ($lo0,$aj,$bi) + mfhi ($hi0,$aj,$bi) $ADDU $lo0,$tj - $MULTU $lo0,$n0 + $MULTU ($lo0,$n0) sltu $at,$lo0,$tj $ADDU $hi0,$at - mflo $m1 + mflo ($m1,$lo0,$n0) - $MULTU $alo,$bi - mflo $alo - mfhi $ahi + $MULTU ($ahi,$bi) + mflo ($alo,$ahi,$bi) + mfhi ($ahi,$ahi,$bi) - $MULTU $nj,$m1 - mflo $lo1 - mfhi $hi1 + $MULTU ($nj,$m1) + mflo ($lo1,$nj,$m1) + mfhi ($hi1,$nj,$m1) - $MULTU $nlo,$m1 + $MULTU ($nhi,$m1) $ADDU $lo1,$lo0 sltu $at,$lo1,$lo0 $ADDU $hi1,$at - mflo $nlo - mfhi $nhi + mflo ($nlo,$nhi,$m1) + mfhi ($nhi,$nhi,$m1) move $tp,$sp li $j,2*$BNSZ @@ -303,19 +305,19 @@ $code.=<<___; $LD $aj,($aj) $LD $nj,($nj) - $MULTU $aj,$bi + $MULTU ($aj,$bi) $ADDU $lo0,$alo,$hi0 $ADDU $lo1,$nlo,$hi1 sltu $at,$lo0,$hi0 sltu $t0,$lo1,$hi1 $ADDU $hi0,$ahi,$at $ADDU $hi1,$nhi,$t0 - mflo $alo - mfhi $ahi + mflo ($alo,$aj,$bi) + mfhi ($ahi,$aj,$bi) $ADDU $lo0,$tj addu $j,$BNSZ - $MULTU $nj,$m1 + $MULTU ($nj,$m1) sltu $at,$lo0,$tj $ADDU $lo1,$lo0 $ADDU $hi0,$at @@ -323,8 +325,8 @@ $code.=<<___; $LD $tj,2*$BNSZ($tp) $ADDU $hi1,$t0 sltu $at,$j,$num - mflo $nlo - mfhi $nhi + mflo ($nlo,$nj,$m1) + mfhi ($nhi,$nj,$m1) $ST $lo1,($tp) bnez $at,.Linner $PTR_ADD $tp,$BNSZ diff --git a/crypto/bn/asm/mips.pl b/crypto/bn/asm/mips.pl index 420f01f3a4..102b656229 100644 --- a/crypto/bn/asm/mips.pl +++ b/crypto/bn/asm/mips.pl @@ -109,6 +109,22 @@ $gp=$v1 if ($flavour =~ /nubi/i); $minus4=$v1; $code.=<<___; +#include "mips_arch.h" + +#if defined(_MIPS_ARCH_MIPS64R6) +# define ddivu(rs,rt) +# define mfqt(rd,rs,rt) ddivu rd,rs,rt +# define mfrm(rd,rs,rt) dmodu rd,rs,rt +#elif defined(_MIPS_ARCH_MIPS32R6) +# define divu(rs,rt) +# define mfqt(rd,rs,rt) divu rd,rs,rt +# define mfrm(rd,rs,rt) modu rd,rs,rt +#else +# define $DIVU(rs,rt) $DIVU $zero,rs,rt +# define mfqt(rd,rs,rt) mflo rd +# define mfrm(rd,rs,rt) mfhi rd +#endif + .rdata .asciiz "mips3.s, Version 1.2" .asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov " @@ -151,7 +167,7 @@ $code.=<<___; .L_bn_mul_add_words_loop: $LD $t0,0($a1) - $MULTU $t0,$a3 + $MULTU ($t0,$a3) $LD $t1,0($a0) $LD $t2,$BNSZ($a1) $LD $t3,$BNSZ($a0) @@ -161,11 +177,11 @@ $code.=<<___; sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit # values", but it seems to work fine # even on 64-bit registers. - mflo $at - mfhi $t0 + mflo ($at,$t0,$a3) + mfhi ($t0,$t0,$a3) $ADDU $t1,$at $ADDU $v0,$t0 - $MULTU $t2,$a3 + $MULTU ($t2,$a3) sltu $at,$t1,$at $ST $t1,0($a0) $ADDU $v0,$at @@ -174,11 +190,11 @@ $code.=<<___; $LD $ta3,3*$BNSZ($a0) $ADDU $t3,$v0 sltu $v0,$t3,$v0 - mflo $at - mfhi $t2 + mflo ($at,$t2,$a3) + mfhi ($t2,$t2,$a3) $ADDU $t3,$at $ADDU $v0,$t2 - $MULTU $ta0,$a3 + $MULTU ($ta0,$a3) sltu $at,$t3,$at $ST $t3,$BNSZ($a0) $ADDU $v0,$at @@ -188,11 +204,11 @@ $code.=<<___; $PTR_ADD $a1,4*$BNSZ $ADDU $ta1,$v0 sltu $v0,$ta1,$v0 - mflo $at - mfhi $ta0 + mflo ($at,$ta0,$a3) + mfhi ($ta0,$ta0,$a3) $ADDU $ta1,$at $ADDU $v0,$ta0 - $MULTU $ta2,$a3 + $MULTU ($ta2,$a3) sltu $at,$ta1,$at $ST $ta1,-2*$BNSZ($a0) $ADDU $v0,$at @@ -201,8 +217,8 @@ $code.=<<___; and $ta0,$a2,$minus4 $ADDU $ta3,$v0 sltu $v0,$ta3,$v0 - mflo $at - mfhi $ta2 + mflo ($at,$ta2,$a3) + mfhi ($ta2,$ta2,$a3) $ADDU $ta3,$at $ADDU $v0,$ta2 sltu $at,$ta3,$at @@ -217,13 +233,13 @@ $code.=<<___; .L_bn_mul_add_words_tail: .set reorder $LD $t0,0($a1) - $MULTU $t0,$a3 + $MULTU ($t0,$a3) $LD $t1,0($a0) subu $a2,1 $ADDU $t1,$v0 sltu $v0,$t1,$v0 - mflo $at - mfhi $t0 + mflo ($at,$t0,$a3) + mfhi ($t0,$t0,$a3) $ADDU $t1,$at $ADDU $v0,$t0 sltu $at,$t1,$at @@ -232,13 +248,13 @@ $code.=<<___; beqz $a2,.L_bn_mul_add_words_return $LD $t0,$BNSZ($a1) - $MULTU $t0,$a3 + $MULTU ($t0,$a3) $LD $t1,$BNSZ($a0) subu $a2,1 $ADDU $t1,$v0 sltu $v0,$t1,$v0 - mflo $at - mfhi $t0 + mflo ($at,$t0,$a3) + mfhi ($t0,$t0,$a3) $ADDU $t1,$at $ADDU $v0,$t0 sltu $at,$t1,$at @@ -247,12 +263,12 @@ $code.=<<___; beqz $a2,.L_bn_mul_add_words_return $LD $t0,2*$BNSZ($a1) - $MULTU $t0,$a3 + $MULTU ($t0,$a3) $LD $t1,2*$BNSZ($a0) $ADDU $t1,$v0 sltu $v0,$t1,$v0 - mflo $at - mfhi $t0 + mflo ($at,$t0,$a3) + mfhi ($t0,$t0,$a3) $ADDU $t1,$at $ADDU $v0,$t0 sltu $at,$t1,$at @@ -310,40 +326,40 @@ $code.=<<___; .L_bn_mul_words_loop: $LD $t0,0($a1) - $MULTU $t0,$a3 + $MULTU ($t0,$a3) $LD $t2,$BNSZ($a1) $LD $ta0,2*$BNSZ($a1) $LD $ta2,3*$BNSZ($a1) - mflo $at - mfhi $t0 + mflo ($at,$t0,$a3) + mfhi ($t0,$t0,$a3) $ADDU $v0,$at sltu $t1,$v0,$at - $MULTU $t2,$a3 + $MULTU ($t2,$a3) $ST $v0,0($a0) $ADDU $v0,$t1,$t0 subu $a2,4 $PTR_ADD $a0,4*$BNSZ $PTR_ADD $a1,4*$BNSZ - mflo $at - mfhi $t2 + mflo ($at,$t2,$a3) + mfhi ($t2,$t2,$a3) $ADDU $v0,$at sltu $t3,$v0,$at - $MULTU $ta0,$a3 + $MULTU ($ta0,$a3) $ST $v0,-3*$BNSZ($a0) $ADDU $v0,$t3,$t2 - mflo $at - mfhi $ta0 + mflo ($at,$ta0,$a3) + mfhi ($ta0,$ta0,$a3) $ADDU $v0,$at sltu $ta1,$v0,$at - $MULTU $ta2,$a3 + $MULTU ($ta2,$a3) $ST $v0,-2*$BNSZ($a0) $ADDU $v0,$ta1,$ta0 and $ta0,$a2,$minus4 - mflo $at - mfhi $ta2 + mflo ($at,$ta2,$a3) + mfhi ($ta2,$ta2,$a3) $ADDU $v0,$at sltu $ta3,$v0,$at $ST $v0,-$BNSZ($a0) @@ -357,10 +373,10 @@ $code.=<<___; .L_bn_mul_words_tail: .set reorder $LD $t0,0($a1) - $MULTU $t0,$a3 + $MULTU ($t0,$a3) subu $a2,1 - mflo $at - mfhi $t0 + mflo ($at,$t0,$a3) + mfhi ($t0,$t0,$a3) $ADDU $v0,$at sltu $t1,$v0,$at $ST $v0,0($a0) @@ -368,10 +384,10 @@ $code.=<<___; beqz $a2,.L_bn_mul_words_return $LD $t0,$BNSZ($a1) - $MULTU $t0,$a3 + $MULTU ($t0,$a3) subu $a2,1 - mflo $at - mfhi $t0 + mflo ($at,$t0,$a3) + mfhi ($t0,$t0,$a3) $ADDU $v0,$at sltu $t1,$v0,$at $ST $v0,$BNSZ($a0) @@ -379,9 +395,9 @@ $code.=<<___; beqz $a2,.L_bn_mul_words_return $LD $t0,2*$BNSZ($a1) - $MULTU $t0,$a3 - mflo $at - mfhi $t0 + $MULTU ($t0,$a3) + mflo ($at,$t0,$a3) + mfhi ($t0,$t0,$a3) $ADDU $v0,$at sltu $t1,$v0,$at $ST $v0,2*$BNSZ($a0) @@ -438,35 +454,35 @@ $code.=<<___; .L_bn_sqr_words_loop: $LD $t0,0($a1) - $MULTU $t0,$t0 + $MULTU ($t0,$t0) $LD $t2,$BNSZ($a1) $LD $ta0,2*$BNSZ($a1) $LD $ta2,3*$BNSZ($a1) - mflo $t1 - mfhi $t0 + mflo ($t1,$t0,$t0) + mfhi ($t0,$t0,$t0) $ST $t1,0($a0) $ST $t0,$BNSZ($a0) - $MULTU $t2,$t2 + $MULTU ($t2,$t2) subu $a2,4 $PTR_ADD $a0,8*$BNSZ $PTR_ADD $a1,4*$BNSZ - mflo $t3 - mfhi $t2 + mflo ($t3,$t2,$t2) + mfhi ($t2,$t2,$t2) $ST $t3,-6*$BNSZ($a0) $ST $t2,-5*$BNSZ($a0) - $MULTU $ta0,$ta0 - mflo $ta1 - mfhi $ta0 + $MULTU ($ta0,$ta0) + mflo ($ta1,$ta0,$ta0) + mfhi ($ta0,$ta0,$ta0) $ST $ta1,-4*$BNSZ($a0) $ST $ta0,-3*$BNSZ($a0) - $MULTU $ta2,$ta2 + $MULTU ($ta2,$ta2) and $ta0,$a2,$minus4 - mflo $ta3 - mfhi $ta2 + mflo ($ta3,$ta2,$ta2) + mfhi ($ta2,$ta2,$ta2) $ST $ta3,-2*$BNSZ($a0) .set noreorder @@ -479,27 +495,27 @@ $code.=<<___; .L_bn_sqr_words_tail: .set reorder $LD $t0,0($a1) - $MULTU $t0,$t0 + $MULTU ($t0,$t0) subu $a2,1 - mflo $t1 - mfhi $t0 + mflo ($t1,$t0,$t0) + mfhi ($t0,$t0,$t0) $ST $t1,0($a0) $ST $t0,$BNSZ($a0) beqz $a2,.L_bn_sqr_words_return $LD $t0,$BNSZ($a1) - $MULTU $t0,$t0 + $MULTU ($t0,$t0) subu $a2,1 - mflo $t1 - mfhi $t0 + mflo ($t1,$t0,$t0) + mfhi ($t0,$t0,$t0) $ST $t1,2*$BNSZ($a0) $ST $t0,3*$BNSZ($a0) beqz $a2,.L_bn_sqr_words_return $LD $t0,2*$BNSZ($a1) - $MULTU $t0,$t0 - mflo $t1 - mfhi $t0 + $MULTU ($t0,$t0) + mflo ($t1,$t0,$t0) + mfhi ($t0,$t0,$t0) $ST $t1,4*$BNSZ($a0) $ST $t0,5*$BNSZ($a0) @@ -823,11 +839,11 @@ $code.=<<___; move $ta3,$ra bal bn_div_words_internal move $ra,$ta3 - $MULTU $ta2,$v0 + $MULTU ($ta2,$v0) $LD $t2,-2*$BNSZ($a3) move $ta0,$zero - mfhi $t1 - mflo $t0 + mfhi ($t1,$ta2,$v0) + mflo ($t0,$ta2,$v0) sltu $t8,$t1,$a1 .L_bn_div_3_words_inner_loop: bnez $t8,.L_bn_div_3_words_inner_loop_done @@ -930,15 +946,15 @@ $code.=<<___; $SRL $HH,$a0,4*$BNSZ # bits $SRL $QT,4*$BNSZ # q=0xffffffff beq $DH,$HH,.L_bn_div_words_skip_div1 - $DIVU $zero,$a0,$DH - mflo $QT + $DIVU ($a0,$DH) + mfqt ($QT,$a0,$DH) .L_bn_div_words_skip_div1: - $MULTU $a2,$QT + $MULTU ($a2,$QT) $SLL $t3,$a0,4*$BNSZ # bits $SRL $at,$a1,4*$BNSZ # bits or $t3,$at - mflo $t0 - mfhi $t1 + mflo ($t0,$a2,$QT) + mfhi ($t1,$a2,$QT) .L_bn_div_words_inner_loop1: sltu $t2,$t3,$t0 seq $t8,$HH,$t1 @@ -963,15 +979,15 @@ $code.=<<___; $SRL $HH,$a0,4*$BNSZ # bits $SRL $QT,4*$BNSZ # q=0xffffffff beq $DH,$HH,.L_bn_div_words_skip_div2 - $DIVU $zero,$a0,$DH - mflo $QT + $DIVU ($a0,$DH) + mfqt ($QT,$a0,$DH) .L_bn_div_words_skip_div2: - $MULTU $a2,$QT + $MULTU ($a2,$QT) $SLL $t3,$a0,4*$BNSZ # bits $SRL $at,$a1,4*$BNSZ # bits or $t3,$at - mflo $t0 - mfhi $t1 + mflo ($t0,$a2,$QT) + mfhi ($t1,$a2,$QT) .L_bn_div_words_inner_loop2: sltu $t2,$t3,$t0 seq $t8,$HH,$t1 @@ -1070,592 +1086,592 @@ $code.=<<___; $LD $b_0,0($a2) $LD $a_1,$BNSZ($a1) $LD $a_2,2*$BNSZ($a1) - $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); + $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3); $LD $a_3,3*$BNSZ($a1) $LD $b_1,$BNSZ($a2) $LD $b_2,2*$BNSZ($a2) $LD $b_3,3*$BNSZ($a2) - mflo $c_1 - mfhi $c_2 + mflo ($c_1,$a_0,$b_0) + mfhi ($c_2,$a_0,$b_0) $LD $a_4,4*$BNSZ($a1) $LD $a_5,5*$BNSZ($a1) - $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); + $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1); $LD $a_6,6*$BNSZ($a1) $LD $a_7,7*$BNSZ($a1) $LD $b_4,4*$BNSZ($a2) $LD $b_5,5*$BNSZ($a2) - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_0,$b_1) + mfhi ($t_2,$a_0,$b_1) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); + $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1); $ADDU $c_3,$t_2,$at $LD $b_6,6*$BNSZ($a2) $LD $b_7,7*$BNSZ($a2) $ST $c_1,0($a0) # r[0]=c1; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_0) + mfhi ($t_2,$a_1,$b_0) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); + $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $c_1,$c_3,$t_2 $ST $c_2,$BNSZ($a0) # r[1]=c2; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_0) + mfhi ($t_2,$a_2,$b_0) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); + $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_1) + mfhi ($t_2,$a_1,$b_1) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); + $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $c_2,$c_1,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_0,$b_2) + mfhi ($t_2,$a_0,$b_2) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); + $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,2*$BNSZ($a0) # r[2]=c3; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_0,$b_3) + mfhi ($t_2,$a_0,$b_3) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); + $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $c_3,$c_2,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_2) + mfhi ($t_2,$a_1,$b_2) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); + $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_1) + mfhi ($t_2,$a_2,$b_1) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); + $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_0) + mfhi ($t_2,$a_3,$b_0) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1); + $MULTU ($a_4,$b_0) # mul_add_c(a[4],b[0],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at $ST $c_1,3*$BNSZ($a0) # r[3]=c1; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_4,$b_0) + mfhi ($t_2,$a_4,$b_0) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); + $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $c_1,$c_3,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_1) + mfhi ($t_2,$a_3,$b_1) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); + $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_2) + mfhi ($t_2,$a_2,$b_2) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); + $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_3) + mfhi ($t_2,$a_1,$b_3) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1); + $MULTU ($a_0,$b_4) # mul_add_c(a[0],b[4],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_0,$b_4) + mfhi ($t_2,$a_0,$b_4) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2); + $MULTU ($a_0,$b_5) # mul_add_c(a[0],b[5],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,4*$BNSZ($a0) # r[4]=c2; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_0,$b_5) + mfhi ($t_2,$a_0,$b_5) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2); + $MULTU ($a_1,$b_4) # mul_add_c(a[1],b[4],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $c_2,$c_1,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_4) + mfhi ($t_2,$a_1,$b_4) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); + $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_3) + mfhi ($t_2,$a_2,$b_3) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); + $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_2) + mfhi ($t_2,$a_3,$b_2) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2); + $MULTU ($a_4,$b_1) # mul_add_c(a[4],b[1],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_4,$b_1) + mfhi ($t_2,$a_4,$b_1) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2); + $MULTU ($a_5,$b_0) # mul_add_c(a[5],b[0],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_5,$b_0) + mfhi ($t_2,$a_5,$b_0) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3); + $MULTU ($a_6,$b_0) # mul_add_c(a[6],b[0],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,5*$BNSZ($a0) # r[5]=c3; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_6,$b_0) + mfhi ($t_2,$a_6,$b_0) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3); + $MULTU ($a_5,$b_1) # mul_add_c(a[5],b[1],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $c_3,$c_2,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_5,$b_1) + mfhi ($t_2,$a_5,$b_1) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3); + $MULTU ($a_4,$b_2) # mul_add_c(a[4],b[2],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_4,$b_2) + mfhi ($t_2,$a_4,$b_2) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); + $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_3) + mfhi ($t_2,$a_3,$b_3) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3); + $MULTU ($a_2,$b_4) # mul_add_c(a[2],b[4],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_4) + mfhi ($t_2,$a_2,$b_4) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3); + $MULTU ($a_1,$b_5) # mul_add_c(a[1],b[5],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_5) + mfhi ($t_2,$a_1,$b_5) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3); + $MULTU ($a_0,$b_6) # mul_add_c(a[0],b[6],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_0,$b_6) + mfhi ($t_2,$a_0,$b_6) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1); + $MULTU ($a_0,$b_7) # mul_add_c(a[0],b[7],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at $ST $c_1,6*$BNSZ($a0) # r[6]=c1; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_0,$b_7) + mfhi ($t_2,$a_0,$b_7) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1); + $MULTU ($a_1,$b_6) # mul_add_c(a[1],b[6],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $c_1,$c_3,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_6) + mfhi ($t_2,$a_1,$b_6) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1); + $MULTU ($a_2,$b_5) # mul_add_c(a[2],b[5],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_5) + mfhi ($t_2,$a_2,$b_5) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1); + $MULTU ($a_3,$b_4) # mul_add_c(a[3],b[4],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_4) + mfhi ($t_2,$a_3,$b_4) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1); + $MULTU ($a_4,$b_3) # mul_add_c(a[4],b[3],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_4,$b_3) + mfhi ($t_2,$a_4,$b_3) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1); + $MULTU ($a_5,$b_2) # mul_add_c(a[5],b[2],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_5,$b_2) + mfhi ($t_2,$a_5,$b_2) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1); + $MULTU ($a_6,$b_1) # mul_add_c(a[6],b[1],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_6,$b_1) + mfhi ($t_2,$a_6,$b_1) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1); + $MULTU ($a_7,$b_0) # mul_add_c(a[7],b[0],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_7,$b_0) + mfhi ($t_2,$a_7,$b_0) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2); + $MULTU ($a_7,$b_1) # mul_add_c(a[7],b[1],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,7*$BNSZ($a0) # r[7]=c2; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_7,$b_1) + mfhi ($t_2,$a_7,$b_1) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2); + $MULTU ($a_6,$b_2) # mul_add_c(a[6],b[2],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $c_2,$c_1,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_6,$b_2) + mfhi ($t_2,$a_6,$b_2) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2); + $MULTU ($a_5,$b_3) # mul_add_c(a[5],b[3],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_5,$b_3) + mfhi ($t_2,$a_5,$b_3) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2); + $MULTU ($a_4,$b_4) # mul_add_c(a[4],b[4],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_4,$b_4) + mfhi ($t_2,$a_4,$b_4) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2); + $MULTU ($a_3,$b_5) # mul_add_c(a[3],b[5],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_5) + mfhi ($t_2,$a_3,$b_5) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2); + $MULTU ($a_2,$b_6) # mul_add_c(a[2],b[6],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_6) + mfhi ($t_2,$a_2,$b_6) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2); + $MULTU ($a_1,$b_7) # mul_add_c(a[1],b[7],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_7) + mfhi ($t_2,$a_1,$b_7) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3); + $MULTU ($a_2,$b_7) # mul_add_c(a[2],b[7],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,8*$BNSZ($a0) # r[8]=c3; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_7) + mfhi ($t_2,$a_2,$b_7) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3); + $MULTU ($a_3,$b_6) # mul_add_c(a[3],b[6],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $c_3,$c_2,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_6) + mfhi ($t_2,$a_3,$b_6) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3); + $MULTU ($a_4,$b_5) # mul_add_c(a[4],b[5],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_4,$b_5) + mfhi ($t_2,$a_4,$b_5) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3); + $MULTU ($a_5,$b_4) # mul_add_c(a[5],b[4],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_5,$b_4) + mfhi ($t_2,$a_5,$b_4) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3); + $MULTU ($a_6,$b_3) # mul_add_c(a[6],b[3],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_6,$b_3) + mfhi ($t_2,$a_6,$b_3) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3); + $MULTU ($a_7,$b_2) # mul_add_c(a[7],b[2],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_7,$b_2) + mfhi ($t_2,$a_7,$b_2) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1); + $MULTU ($a_7,$b_3) # mul_add_c(a[7],b[3],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at $ST $c_1,9*$BNSZ($a0) # r[9]=c1; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_7,$b_3) + mfhi ($t_2,$a_7,$b_3) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1); + $MULTU ($a_6,$b_4) # mul_add_c(a[6],b[4],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $c_1,$c_3,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_6,$b_4) + mfhi ($t_2,$a_6,$b_4) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1); + $MULTU ($a_5,$b_5) # mul_add_c(a[5],b[5],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_5,$b_5) + mfhi ($t_2,$a_5,$b_5) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1); + $MULTU ($a_4,$b_6) # mul_add_c(a[4],b[6],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_4,$b_6) + mfhi ($t_2,$a_4,$b_6) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1); + $MULTU ($a_3,$b_7) # mul_add_c(a[3],b[7],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_7) + mfhi ($t_2,$a_3,$b_7) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_4,$b_7 # mul_add_c(a[4],b[7],c3,c1,c2); + $MULTU ($a_4,$b_7) # mul_add_c(a[4],b[7],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,10*$BNSZ($a0) # r[10]=c2; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_4,$b_7) + mfhi ($t_2,$a_4,$b_7) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2); + $MULTU ($a_5,$b_6) # mul_add_c(a[5],b[6],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $c_2,$c_1,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_5,$b_6) + mfhi ($t_2,$a_5,$b_6) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2); + $MULTU ($a_6,$b_5) # mul_add_c(a[6],b[5],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_6,$b_5) + mfhi ($t_2,$a_6,$b_5) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2); + $MULTU ($a_7,$b_4) # mul_add_c(a[7],b[4],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_7,$b_4) + mfhi ($t_2,$a_7,$b_4) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3); + $MULTU ($a_7,$b_5) # mul_add_c(a[7],b[5],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,11*$BNSZ($a0) # r[11]=c3; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_7,$b_5) + mfhi ($t_2,$a_7,$b_5) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3); + $MULTU ($a_6,$b_6) # mul_add_c(a[6],b[6],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $c_3,$c_2,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_6,$b_6) + mfhi ($t_2,$a_6,$b_6) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3); + $MULTU ($a_5,$b_7) # mul_add_c(a[5],b[7],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_5,$b_7) + mfhi ($t_2,$a_5,$b_7) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1); + $MULTU ($a_6,$b_7) # mul_add_c(a[6],b[7],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at $ST $c_1,12*$BNSZ($a0) # r[12]=c1; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_6,$b_7) + mfhi ($t_2,$a_6,$b_7) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1); + $MULTU ($a_7,$b_6) # mul_add_c(a[7],b[6],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $c_1,$c_3,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_7,$b_6) + mfhi ($t_2,$a_7,$b_6) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2); + $MULTU ($a_7,$b_7) # mul_add_c(a[7],b[7],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,13*$BNSZ($a0) # r[13]=c2; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_7,$b_7) + mfhi ($t_2,$a_7,$b_7) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at @@ -1716,144 +1732,144 @@ $code.=<<___; $LD $b_0,0($a2) $LD $a_1,$BNSZ($a1) $LD $a_2,2*$BNSZ($a1) - $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); + $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3); $LD $a_3,3*$BNSZ($a1) $LD $b_1,$BNSZ($a2) $LD $b_2,2*$BNSZ($a2) $LD $b_3,3*$BNSZ($a2) - mflo $c_1 - mfhi $c_2 + mflo ($c_1,$a_0,$b_0) + mfhi ($c_2,$a_0,$b_0) $ST $c_1,0($a0) - $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); - mflo $t_1 - mfhi $t_2 + $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1); + mflo ($t_1,$a_0,$b_1) + mfhi ($t_2,$a_0,$b_1) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); + $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1); $ADDU $c_3,$t_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_0) + mfhi ($t_2,$a_1,$b_0) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); + $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $c_1,$c_3,$t_2 $ST $c_2,$BNSZ($a0) - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_0) + mfhi ($t_2,$a_2,$b_0) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); + $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_1) + mfhi ($t_2,$a_1,$b_1) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); + $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $c_2,$c_1,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_0,$b_2) + mfhi ($t_2,$a_0,$b_2) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); + $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,2*$BNSZ($a0) - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_0,$b_3) + mfhi ($t_2,$a_0,$b_3) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); + $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $c_3,$c_2,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_2) + mfhi ($t_2,$a_1,$b_2) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); + $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_1) + mfhi ($t_2,$a_2,$b_1) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); + $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_0) + mfhi ($t_2,$a_3,$b_0) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); + $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at $ST $c_1,3*$BNSZ($a0) - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_1) + mfhi ($t_2,$a_3,$b_1) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); + $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $c_1,$c_3,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_2) + mfhi ($t_2,$a_2,$b_2) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); + $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_3) + mfhi ($t_2,$a_1,$b_3) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); + $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,4*$BNSZ($a0) - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_3) + mfhi ($t_2,$a_2,$b_3) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); + $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $c_2,$c_1,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_2) + mfhi ($t_2,$a_3,$b_2) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); + $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,5*$BNSZ($a0) - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_3) + mfhi ($t_2,$a_3,$b_3) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at @@ -1888,11 +1904,9 @@ my ($hi,$lo,$c0,$c1,$c2, # commented as "forward multiplication" below]; )=@_; $code.=<<___; - mflo $lo - mfhi $hi $ADDU $c0,$lo sltu $at,$c0,$lo - $MULTU $an,$bn # forward multiplication + $MULTU ($an,$bn) # forward multiplication $ADDU $c0,$lo $ADDU $at,$hi sltu $lo,$c0,$lo @@ -1902,15 +1916,17 @@ ___ $code.=<<___ if (!$warm); sltu $c2,$c1,$at $ADDU $c1,$hi - sltu $hi,$c1,$hi - $ADDU $c2,$hi ___ $code.=<<___ if ($warm); sltu $at,$c1,$at $ADDU $c1,$hi $ADDU $c2,$at +___ +$code.=<<___; sltu $hi,$c1,$hi $ADDU $c2,$hi + mflo ($lo,$an,$bn) + mfhi ($hi,$an,$bn) ___ } @@ -1940,21 +1956,21 @@ $code.=<<___; $LD $a_2,2*$BNSZ($a1) $LD $a_3,3*$BNSZ($a1) - $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); + $MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3); $LD $a_4,4*$BNSZ($a1) $LD $a_5,5*$BNSZ($a1) $LD $a_6,6*$BNSZ($a1) $LD $a_7,7*$BNSZ($a1) - mflo $c_1 - mfhi $c_2 + mflo ($c_1,$a_0,$a_0) + mfhi ($c_2,$a_0,$a_0) $ST $c_1,0($a0) - $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); - mflo $t_1 - mfhi $t_2 + $MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1); + mflo ($t_1,$a_0,$a_1) + mfhi ($t_2,$a_0,$a_1) slt $c_1,$t_2,$zero $SLL $t_2,1 - $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); + $MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 @@ -1962,20 +1978,22 @@ $code.=<<___; sltu $at,$c_2,$t_1 $ADDU $c_3,$t_2,$at $ST $c_2,$BNSZ($a0) + mflo ($t_1,$a_2,$a_0) + mfhi ($t_2,$a_2,$a_0) ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2); $code.=<<___; - mflo $t_1 - mfhi $t_2 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); + $MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,2*$BNSZ($a0) + mflo ($t_1,$a_0,$a_3) + mfhi ($t_2,$a_0,$a_3) ___ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3); @@ -1989,16 +2007,16 @@ ___ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1); $code.=<<___; - mflo $t_1 - mfhi $t_2 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2); + $MULTU ($a_0,$a_5) # mul_add_c2(a[0],b[5],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,4*$BNSZ($a0) + mflo ($t_1,$a_0,$a_5) + mfhi ($t_2,$a_0,$a_5) ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2); @@ -2016,16 +2034,16 @@ ___ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3); $code.=<<___; - mflo $t_1 - mfhi $t_2 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1); + $MULTU ($a_0,$a_7) # mul_add_c2(a[0],b[7],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at $ST $c_1,6*$BNSZ($a0) + mflo ($t_1,$a_0,$a_7) + mfhi ($t_2,$a_0,$a_7) ___ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, $a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1); @@ -2045,16 +2063,16 @@ ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, $a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2); $code.=<<___; - mflo $t_1 - mfhi $t_2 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3); + $MULTU ($a_2,$a_7) # mul_add_c2(a[2],b[7],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,8*$BNSZ($a0) + mflo ($t_1,$a_2,$a_7) + mfhi ($t_2,$a_2,$a_7) ___ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3); @@ -2070,16 +2088,16 @@ ___ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, $a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1); $code.=<<___; - mflo $t_1 - mfhi $t_2 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2); + $MULTU ($a_4,$a_7) # mul_add_c2(a[4],b[7],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,10*$BNSZ($a0) + mflo ($t_1,$a_4,$a_7) + mfhi ($t_2,$a_4,$a_7) ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2); @@ -2091,24 +2109,22 @@ ___ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3); $code.=<<___; - mflo $t_1 - mfhi $t_2 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1); + $MULTU ($a_6,$a_7) # mul_add_c2(a[6],b[7],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at $ST $c_1,12*$BNSZ($a0) + mflo ($t_1,$a_6,$a_7) + mfhi ($t_2,$a_6,$a_7) ___ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, $a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2); $code.=<<___; $ST $c_2,13*$BNSZ($a0) - mflo $t_1 - mfhi $t_2 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at @@ -2152,19 +2168,19 @@ $code.=<<___; .set reorder $LD $a_0,0($a1) $LD $a_1,$BNSZ($a1) - $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); + $MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3); $LD $a_2,2*$BNSZ($a1) $LD $a_3,3*$BNSZ($a1) - mflo $c_1 - mfhi $c_2 + mflo ($c_1,$a_0,$a_0) + mfhi ($c_2,$a_0,$a_0) $ST $c_1,0($a0) - $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); - mflo $t_1 - mfhi $t_2 + $MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1); + mflo ($t_1,$a_0,$a_1) + mfhi ($t_2,$a_0,$a_1) slt $c_1,$t_2,$zero $SLL $t_2,1 - $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); + $MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 @@ -2172,20 +2188,22 @@ $code.=<<___; sltu $at,$c_2,$t_1 $ADDU $c_3,$t_2,$at $ST $c_2,$BNSZ($a0) + mflo ($t_1,$a_2,$a_0) + mfhi ($t_2,$a_2,$a_0) ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2); $code.=<<___; - mflo $t_1 - mfhi $t_2 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); + $MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,2*$BNSZ($a0) + mflo ($t_1,$a_0,$a_3) + mfhi ($t_2,$a_0,$a_3) ___ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_1,$a_2); # mul_add_c2(a2[1],b[2],c1,c2,c3); @@ -2197,24 +2215,22 @@ ___ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1); $code.=<<___; - mflo $t_1 - mfhi $t_2 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); + $MULTU ($a_2,$a_3) # mul_add_c2(a[2],b[3],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,4*$BNSZ($a0) + mflo ($t_1,$a_2,$a_3) + mfhi ($t_2,$a_2,$a_3) ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3); $code.=<<___; $ST $c_3,5*$BNSZ($a0) - mflo $t_1 - mfhi $t_2 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at diff --git a/crypto/bn/build.info b/crypto/bn/build.info index c608ecce82..354410615b 100644 --- a/crypto/bn/build.info +++ b/crypto/bn/build.info @@ -34,8 +34,10 @@ INCLUDE[sparct4-mont.o]=.. GENERATE[sparcv9-gf2m.S]=asm/sparcv9-gf2m.pl $(PERLASM_SCHEME) INCLUDE[sparcv9-gf2m.o]=.. -GENERATE[bn-mips.s]=asm/mips.pl $(PERLASM_SCHEME) -GENERATE[mips-mont.s]=asm/mips-mont.pl $(PERLASM_SCHEME) +GENERATE[bn-mips.S]=asm/mips.pl $(PERLASM_SCHEME) +INCLUDE[bn-mips.o]=.. +GENERATE[mips-mont.S]=asm/mips-mont.pl $(PERLASM_SCHEME) +INCLUDE[mips-mont.o]=.. GENERATE[s390x-mont.S]=asm/s390x-mont.pl $(PERLASM_SCHEME) GENERATE[s390x-gf2m.s]=asm/s390x-gf2m.pl $(PERLASM_SCHEME) @@ -64,16 +66,8 @@ GENERATE[armv4-gf2m.S]=asm/armv4-gf2m.pl $(PERLASM_SCHEME) INCLUDE[armv4-gf2m.o]=.. GENERATE[armv8-mont.S]=asm/armv8-mont.pl $(PERLASM_SCHEME) -OVERRIDES=bn-mips3.o pa-risc2W.o pa-risc2.c +OVERRIDES=pa-risc2W.o pa-risc2.c BEGINRAW[Makefile] -##### BN assembler implementations - -{- $builddir -}/bn-mips3.o: {- $sourcedir -}/asm/mips3.s - @if [ "$(CC)" = "gcc" ]; then \ - ABI=`expr "$(CFLAGS)" : ".*-mabi=\([n3264]*\)"` && \ - as -$$ABI -O -o $@ {- $sourcedir -}/asm/mips3.s; \ - else $(CC) -c $(CFLAGS) $(LIB_CFLAGS) -o $@ {- $sourcedir -}/asm/mips3.s; fi - # GNU assembler fails to compile PA-RISC2 modules, insist on calling # vendor assembler... {- $builddir -}/pa-risc2W.o: {- $sourcedir -}/asm/pa-risc2W.s diff --git a/crypto/mips_arch.h b/crypto/mips_arch.h new file mode 100644 index 0000000000..e0cd905169 --- /dev/null +++ b/crypto/mips_arch.h @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#ifndef __MIPS_ARCH_H__ +# define __MIPS_ARCH_H__ + +# if (defined(__mips_smartmips) || defined(_MIPS_ARCH_MIPS32R3) || \ + defined(_MIPS_ARCH_MIPS32R5) || defined(_MIPS_ARCH_MIPS32R6)) + && !defined(_MIPS_ARCH_MIPS32R2) +# define _MIPS_ARCH_MIPS32R2 +# endif + +# if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \ + defined(_MIPS_ARCH_MIPS64R6)) \ + && !defined(_MIPS_ARCH_MIPS64R2) +# define _MIPS_ARCH_MIPS64R2 +# endif + +# if defined(_MIPS_ARCH_MIPS64R6) +# define dmultu(rs,rt) +# define mflo(rd,rs,rt) dmulu rd,rs,rt +# define mfhi(rd,rs,rt) dmuhu rd,rs,rt +# elif defined(_MIPS_ARCH_MIPS32R6) +# define multu(rs,rt) +# define mflo(rd,rs,rt) mulu rd,rs,rt +# define mfhi(rd,rs,rt) muhu rd,rs,rt +# else +# define dmultu(rs,rt) dmultu rs,rt +# define multu(rs,rt) multu rs,rt +# define mflo(rd,rs,rt) mflo rd +# define mfhi(rd,rs,rt) mfhi rd +# endif + +#endif diff --git a/crypto/poly1305/asm/poly1305-mips.pl b/crypto/poly1305/asm/poly1305-mips.pl index d2b3e90d93..28b6772ee5 100755 --- a/crypto/poly1305/asm/poly1305-mips.pl +++ b/crypto/poly1305/asm/poly1305-mips.pl @@ -67,6 +67,8 @@ $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000"; ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1); $code.=<<___; +#include "mips_arch.h" + #ifdef MIPSEB # define MSB 0 # define LSB 7 @@ -92,10 +94,15 @@ poly1305_init: beqz $inp,.Lno_key +#if defined(_MIPS_ARCH_MIPS64R6) + ld $in0,0($inp) + ld $in1,8($inp) +#else ldl $in0,0+MSB($inp) ldl $in1,8+MSB($inp) ldr $in0,0+LSB($inp) ldr $in1,8+LSB($inp) +#endif #ifdef MIPSEB # if defined(_MIPS_ARCH_MIPS64R2) dsbh $in0,$in0 # byte swap @@ -182,7 +189,7 @@ poly1305_blocks_internal: .frame $sp,6*8,$ra .mask $SAVED_REGS_MASK,-8 .set noreorder - dsub $sp,6*8 + dsubu $sp,6*8 sd $s5,40($sp) sd $s4,32($sp) ___ @@ -204,11 +211,16 @@ $code.=<<___; ld $s1,40($ctx) .Loop: +#if defined(_MIPS_ARCH_MIPS64R6) + ld $in0,0($inp) # load input + ld $in1,8($inp) +#else ldl $in0,0+MSB($inp) # load input ldl $in1,8+MSB($inp) ldr $in0,0+LSB($inp) - daddiu $len,-1 ldr $in1,8+LSB($inp) +#endif + daddiu $len,-1 daddiu $inp,16 #ifdef MIPSEB # if defined(_MIPS_ARCH_MIPS64R2) @@ -258,42 +270,42 @@ $code.=<<___; sltu $tmp1,$h1,$in1 daddu $h1,$tmp0 - dmultu $r0,$h0 # h0*r0 + dmultu ($r0,$h0) # h0*r0 daddu $h2,$padbit sltu $tmp0,$h1,$tmp0 - mflo $d0 - mfhi $d1 + mflo ($d0,$r0,$h0) + mfhi ($d1,$r0,$h0) - dmultu $s1,$h1 # h1*5*r1 + dmultu ($s1,$h1) # h1*5*r1 daddu $tmp0,$tmp1 daddu $h2,$tmp0 - mflo $tmp0 - mfhi $tmp1 + mflo ($tmp0,$s1,$h1) + mfhi ($tmp1,$s1,$h1) - dmultu $r1,$h0 # h0*r1 + dmultu ($r1,$h0) # h0*r1 daddu $d0,$tmp0 daddu $d1,$tmp1 - mflo $tmp2 - mfhi $d2 + mflo ($tmp2,$r1,$h0) + mfhi ($d2,$r1,$h0) sltu $tmp0,$d0,$tmp0 daddu $d1,$tmp0 - dmultu $r0,$h1 # h1*r0 + dmultu ($r0,$h1) # h1*r0 daddu $d1,$tmp2 sltu $tmp2,$d1,$tmp2 - mflo $tmp0 - mfhi $tmp1 + mflo ($tmp0,$r0,$h1) + mfhi ($tmp1,$r0,$h1) daddu $d2,$tmp2 - dmultu $s1,$h2 # h2*5*r1 + dmultu ($s1,$h2) # h2*5*r1 daddu $d1,$tmp0 daddu $d2,$tmp1 - mflo $tmp2 + mflo ($tmp2,$s1,$h2) - dmultu $r0,$h2 # h2*r0 + dmultu ($r0,$h2) # h2*r0 sltu $tmp0,$d1,$tmp0 daddu $d2,$tmp0 - mflo $tmp3 + mflo ($tmp3,$r0,$h2) daddu $d1,$tmp2 daddu $d2,$tmp3 @@ -329,7 +341,7 @@ $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue ___ $code.=<<___; jr $ra - dadd $sp,6*8 + daddu $sp,6*8 .end poly1305_blocks_internal ___ } diff --git a/crypto/poly1305/build.info b/crypto/poly1305/build.info index d575f5a63e..358fa82212 100644 --- a/crypto/poly1305/build.info +++ b/crypto/poly1305/build.info @@ -13,6 +13,7 @@ INCLUDE[poly1305-armv4.o]=.. GENERATE[poly1305-armv8.S]=asm/poly1305-armv8.pl $(PERLASM_SCHEME) INCLUDE[poly1305-armv8.o]=.. GENERATE[poly1305-mips.S]=asm/poly1305-mips.pl $(PERLASM_SCHEME) +INCLUDE[poly1305-mips.o]=.. BEGINRAW[Makefile(unix)] {- $builddir -}/poly1305-%.S: {- $sourcedir -}/asm/poly1305-%.pl diff --git a/crypto/sha/asm/sha1-mips.pl b/crypto/sha/asm/sha1-mips.pl index 882f9731cf..d9911c8503 100644 --- a/crypto/sha/asm/sha1-mips.pl +++ b/crypto/sha/asm/sha1-mips.pl @@ -56,15 +56,15 @@ $flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 if ($flavour =~ /64|n32/i) { - $PTR_ADD="dadd"; # incidentally works even on n32 - $PTR_SUB="dsub"; # incidentally works even on n32 + $PTR_ADD="daddu"; # incidentally works even on n32 + $PTR_SUB="dsubu"; # incidentally works even on n32 $REG_S="sd"; $REG_L="ld"; $PTR_SLL="dsll"; # incidentally works even on n32 $SZREG=8; } else { - $PTR_ADD="add"; - $PTR_SUB="sub"; + $PTR_ADD="addu"; + $PTR_SUB="subu"; $REG_S="sw"; $REG_L="lw"; $PTR_SLL="sll"; @@ -126,10 +126,14 @@ $code.=<<___; addu $e,$K # $i xor $t0,$c,$d rotr $t1,$a,27 - lwl @X[$j],$j*4+$MSB($inp) and $t0,$b addu $e,$t1 +#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6) + lw @X[$j],$j*4($inp) +#else + lwl @X[$j],$j*4+$MSB($inp) lwr @X[$j],$j*4+$LSB($inp) +#endif xor $t0,$d addu $e,@X[$i] rotr $b,$b,2 @@ -336,14 +340,12 @@ $FRAMESIZE=16; # large enough to accommodate NUBI saved registers $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0xc0fff008" : "0xc0ff0000"; $code=<<___; +#include "mips_arch.h" + #ifdef OPENSSL_FIPSCANISTER # include #endif -#if defined(__mips_smartmips) && !defined(_MIPS_ARCH_MIPS32R2) -#define _MIPS_ARCH_MIPS32R2 -#endif - .text .set noat @@ -387,10 +389,16 @@ $code.=<<___; .align 4 .Loop: .set reorder +#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6) + lui $K,0x5a82 + lw @X[0],($inp) + ori $K,0x7999 # K_00_19 +#else lwl @X[0],$MSB($inp) lui $K,0x5a82 lwr @X[0],$LSB($inp) ori $K,0x7999 # K_00_19 +#endif ___ for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); } for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); } diff --git a/crypto/sha/asm/sha512-mips.pl b/crypto/sha/asm/sha512-mips.pl index 5c2d23faaf..5464543db8 100644 --- a/crypto/sha/asm/sha512-mips.pl +++ b/crypto/sha/asm/sha512-mips.pl @@ -60,16 +60,16 @@ $flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 if ($flavour =~ /64|n32/i) { $PTR_LA="dla"; - $PTR_ADD="dadd"; # incidentally works even on n32 - $PTR_SUB="dsub"; # incidentally works even on n32 + $PTR_ADD="daddu"; # incidentally works even on n32 + $PTR_SUB="dsubu"; # incidentally works even on n32 $REG_S="sd"; $REG_L="ld"; $PTR_SLL="dsll"; # incidentally works even on n32 $SZREG=8; } else { $PTR_LA="la"; - $PTR_ADD="add"; - $PTR_SUB="sub"; + $PTR_ADD="addu"; + $PTR_SUB="subu"; $REG_S="sw"; $REG_L="lw"; $PTR_SLL="sll"; @@ -135,8 +135,12 @@ my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]); $code.=<<___ if ($i<15); +#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6) + ${LD} @X[1],`($i+1)*$SZ`($inp) +#else ${LD}l @X[1],`($i+1)*$SZ+$MSB`($inp) ${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp) +#endif ___ $code.=<<___ if (!$big_endian && $i<16 && $SZ==4); #if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) @@ -298,14 +302,12 @@ $FRAMESIZE=16*$SZ+16*$SZREG; $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0xc0fff008" : "0xc0ff0000"; $code.=<<___; +#include "mips_arch.h" + #ifdef OPENSSL_FIPSCANISTER # include #endif -#if defined(__mips_smartmips) && !defined(_MIPS_ARCH_MIPS32R2) -#define _MIPS_ARCH_MIPS32R2 -#endif - .text .set noat #if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__)) @@ -369,8 +371,12 @@ $code.=<<___; .align 5 .Loop: +#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6) + ${LD} @X[0],($inp) +#else ${LD}l @X[0],$MSB($inp) ${LD}r @X[0],$LSB($inp) +#endif ___ for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); } diff --git a/crypto/sha/build.info b/crypto/sha/build.info index 5843e50894..4b3225bfe3 100644 --- a/crypto/sha/build.info +++ b/crypto/sha/build.info @@ -39,8 +39,11 @@ GENERATE[sha256-parisc.s]=asm/sha512-parisc.pl $(PERLASM_SCHEME) GENERATE[sha512-parisc.s]=asm/sha512-parisc.pl $(PERLASM_SCHEME) GENERATE[sha1-mips.S]=asm/sha1-mips.pl $(PERLASM_SCHEME) +INCLUDE[sha1-mips.o]=.. GENERATE[sha256-mips.S]=asm/sha512-mips.pl $(PERLASM_SCHEME) +INCLUDE[sha256-mips.o]=.. GENERATE[sha512-mips.S]=asm/sha512-mips.pl $(PERLASM_SCHEME) +INCLUDE[sha512-mips.o]=.. GENERATE[sha1-armv4-large.S]=asm/sha1-armv4-large.pl $(PERLASM_SCHEME) INCLUDE[sha1-armv4-large.o]=.. -- 2.34.1