Minor MIPS III/IV tune-up.
author: Andy Polyakov <appro@openssl.org>
Tue, 24 Aug 1999 16:02:16 +0000 (16:02 +0000)
committer: Andy Polyakov <appro@openssl.org>
Tue, 24 Aug 1999 16:02:16 +0000 (16:02 +0000)
Configure
crypto/bn/asm/mips3.s
crypto/bn/bn_div.c

index 838f3cd..fcd7b39 100755 (executable)
--- a/Configure
+++ b/Configure
@@ -154,11 +154,11 @@ my %table=(
 # Only N32 and N64 ABIs are supported. If you need O32 ABI build, invoke
 # './Configure irix-[g]cc' manually.
 # -mips4 flag is added by ./config when appropriate.
 # Only N32 and N64 ABIs are supported. If you need O32 ABI build, invoke
 # './Configure irix-[g]cc' manually.
 # -mips4 flag is added by ./config when appropriate.
-"irix-mips3-gcc","gcc:-mabi=n32 -mmips-as -O3 -DTERMIOS -DB_ENDIAN::(unknown)::MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC2 DES_PTR BF_PTR SIXTY_FOUR_BIT:asm/mips3.o::",
-"irix-mips3-cc", "cc:-n32 -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN::(unknown)::DES_PTR DES_RISC2 DES_UNROLL BF_PTR SIXTY_FOUR_BIT:asm/mips3.o::",
+"irix-mips3-gcc","gcc:-mabi=n32 -mmips-as -O3 -DTERMIOS -DB_ENDIAN -DBN_DIV3W::(unknown)::MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC2 DES_PTR BF_PTR SIXTY_FOUR_BIT:asm/mips3.o::",
+"irix-mips3-cc", "cc:-n32 -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN -DBN_DIV3W::(unknown)::DES_PTR DES_RISC2 DES_UNROLL BF_PTR SIXTY_FOUR_BIT:asm/mips3.o::",
 # N64 ABI builds.
 # N64 ABI builds.
-"irix64-mips4-gcc","gcc:-mabi=64 -mips4 -mmips-as -O3 -DTERMIOS -DB_ENDIAN::(unknown)::DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT_LONG:asm/mips3.o::",
-"irix64-mips4-cc", "cc:-64 -mips4 -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN::(unknown)::DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT_LONG:asm/mips3.o::",
+"irix64-mips4-gcc","gcc:-mabi=64 -mips4 -mmips-as -O3 -DTERMIOS -DB_ENDIAN -DBN_DIV3W::(unknown)::DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT_LONG:asm/mips3.o::",
+"irix64-mips4-cc", "cc:-64 -mips4 -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN -DBN_DIV3W::(unknown)::DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT_LONG:asm/mips3.o::",
 
 # HPUX 9.X config.
 # Don't use the bundled cc.  It is broken.  Use HP ANSI C if possible, or
 
 # HPUX 9.X config.
 # Don't use the bundled cc.  It is broken.  Use HP ANSI C if possible, or
index 191345d..2df4dcd 100644 (file)
@@ -395,32 +395,32 @@ LEAF(bn_add_words)
 
 .L_bn_add_words_loop:
        ld      ta0,0(a2)
 
 .L_bn_add_words_loop:
        ld      ta0,0(a2)
+       subu    a3,4
        ld      t1,8(a1)
        ld      t1,8(a1)
-       ld      ta1,8(a2)
+       and     AT,a3,MINUS4
        ld      t2,16(a1)
        ld      t2,16(a1)
-       ld      ta2,16(a2)
+       PTR_ADD a2,32
        ld      t3,24(a1)
        ld      t3,24(a1)
-       ld      ta3,24(a2)
+       PTR_ADD a0,32
+       ld      ta1,-24(a2)
+       PTR_ADD a1,32
+       ld      ta2,-16(a2)
+       ld      ta3,-8(a2)
        daddu   ta0,t0
        daddu   ta0,t0
-       subu    a3,4
        sltu    t8,ta0,t0
        daddu   t0,ta0,v0
        sltu    t8,ta0,t0
        daddu   t0,ta0,v0
-       PTR_ADD a0,32
        sltu    v0,t0,ta0
        sd      t0,-32(a0)
        daddu   v0,t8
 
        daddu   ta1,t1
        sltu    v0,t0,ta0
        sd      t0,-32(a0)
        daddu   v0,t8
 
        daddu   ta1,t1
-       PTR_ADD a1,32
        sltu    t9,ta1,t1
        daddu   t1,ta1,v0
        sltu    t9,ta1,t1
        daddu   t1,ta1,v0
-       PTR_ADD a2,32
        sltu    v0,t1,ta1
        sd      t1,-24(a0)
        daddu   v0,t9
 
        daddu   ta2,t2
        sltu    v0,t1,ta1
        sd      t1,-24(a0)
        daddu   v0,t9
 
        daddu   ta2,t2
-       and     AT,a3,MINUS4
        sltu    t8,ta2,t2
        daddu   t2,ta2,v0
        sltu    v0,t2,ta2
        sltu    t8,ta2,t2
        daddu   t2,ta2,v0
        sltu    v0,t2,ta2
@@ -495,25 +495,26 @@ LEAF(bn_sub_words)
 
 .L_bn_sub_words_loop:
        ld      ta0,0(a2)
 
 .L_bn_sub_words_loop:
        ld      ta0,0(a2)
+       subu    a3,4
        ld      t1,8(a1)
        ld      t1,8(a1)
-       ld      ta1,8(a2)
+       and     AT,a3,MINUS4
        ld      t2,16(a1)
        ld      t2,16(a1)
-       ld      ta2,16(a2)
+       PTR_ADD a2,32
        ld      t3,24(a1)
        ld      t3,24(a1)
-       ld      ta3,24(a2)
+       PTR_ADD a0,32
+       ld      ta1,-24(a2)
+       PTR_ADD a1,32
+       ld      ta2,-16(a2)
+       ld      ta3,-8(a2)
        sltu    t8,t0,ta0
        dsubu   t0,ta0
        sltu    t8,t0,ta0
        dsubu   t0,ta0
-       subu    a3,4
        dsubu   ta0,t0,v0
        dsubu   ta0,t0,v0
-       and     AT,a3,MINUS4
-       sd      ta0,0(a0)
+       sd      ta0,-32(a0)
        MOVNZ   (t0,v0,t8)
 
        sltu    t9,t1,ta1
        dsubu   t1,ta1
        MOVNZ   (t0,v0,t8)
 
        sltu    t9,t1,ta1
        dsubu   t1,ta1
-       PTR_ADD a0,32
        dsubu   ta1,t1,v0
        dsubu   ta1,t1,v0
-       PTR_ADD a1,32
        sd      ta1,-24(a0)
        MOVNZ   (t1,v0,t9)
 
        sd      ta1,-24(a0)
        MOVNZ   (t1,v0,t9)
 
@@ -521,7 +522,6 @@ LEAF(bn_sub_words)
        sltu    t8,t2,ta2
        dsubu   t2,ta2
        dsubu   ta2,t2,v0
        sltu    t8,t2,ta2
        dsubu   t2,ta2
        dsubu   ta2,t2,v0
-       PTR_ADD a2,32
        sd      ta2,-16(a0)
        MOVNZ   (t2,v0,t8)
 
        sd      ta2,-16(a0)
        MOVNZ   (t2,v0,t8)
 
@@ -574,6 +574,51 @@ END(bn_sub_words)
 
 #undef MINUS4
 
 
 #undef MINUS4
 
+.align 5
+LEAF(bn_div_3_words)
+       .set    reorder
+       move    a3,a0           /* we know that bn_div_words doesn't
+                                * touch a3, ta2, ta3 and preserves a2
+                                * so that we can save two arguments
+                                * and return address in registers
+                                * instead of stack:-)
+                                */
+       ld      a0,(a3)
+       move    ta2,a1
+       ld      a1,-8(a3)
+       move    ta3,ra
+       move    v1,zero
+       li      v0,-1
+       beq     a0,a2,.L_bn_div_3_words_skip_div
+       bal     bn_div_words
+       move    ra,ta3
+.L_bn_div_3_words_skip_div:
+       dmultu  ta2,v0
+       ld      t2,-16(a3)
+       move    ta0,zero
+       mfhi    t1
+       mflo    t0
+       sltu    t8,t1,v1
+.L_bn_div_3_words_inner_loop:
+       bnez    t8,.L_bn_div_3_words_inner_loop_done
+       sgeu    AT,t2,t0
+       seq     t9,t1,v1
+       and     AT,t9
+       sltu    t3,t0,ta2
+       daddu   v1,a2
+       dsubu   t1,t3
+       dsubu   t0,ta2
+       sltu    t8,t1,v1
+       sltu    ta0,v1,a2
+       or      t8,ta0
+       .set    noreorder
+       beqzl   AT,.L_bn_div_3_words_inner_loop
+       dsubu   v0,1
+       .set    reorder
+.L_bn_div_3_words_inner_loop_done:
+       jr      ra
+END(bn_div_3_words)
+
 .align 5
 LEAF(bn_div_words)
        .set    noreorder
 .align 5
 LEAF(bn_div_words)
        .set    noreorder
@@ -633,16 +678,16 @@ LEAF(bn_div_words)
        seq     t8,HH,t1
        sltu    AT,HH,t1
        and     t2,t8
        seq     t8,HH,t1
        sltu    AT,HH,t1
        and     t2,t8
+       sltu    v0,t0,a2
        or      AT,t2
        .set    noreorder
        beqz    AT,.L_bn_div_words_inner_loop1_done
        or      AT,t2
        .set    noreorder
        beqz    AT,.L_bn_div_words_inner_loop1_done
-       sltu    t2,t0,a2
-       .set    reorder
-       dsubu   QT,1
+       dsubu   t1,v0
        dsubu   t0,a2
        dsubu   t0,a2
-       dsubu   t1,t2
        b       .L_bn_div_words_inner_loop1
        b       .L_bn_div_words_inner_loop1
-.L_bn_div_words_inner_loop1_done:      
+       dsubu   QT,1
+       .set    reorder
+.L_bn_div_words_inner_loop1_done:
 
        dsll    a1,32
        dsubu   a0,t3,t0
 
        dsll    a1,32
        dsubu   a0,t3,t0
@@ -655,6 +700,7 @@ LEAF(bn_div_words)
        ddivu   zero,a0,DH
        mflo    QT
 .L_bn_div_words_skip_div2:
        ddivu   zero,a0,DH
        mflo    QT
 .L_bn_div_words_skip_div2:
+#undef DH
        dmultu  a2,QT
        dsll    t3,a0,32
        dsrl    AT,a1,32
        dmultu  a2,QT
        dsll    t3,a0,32
        dsrl    AT,a1,32
@@ -666,69 +712,26 @@ LEAF(bn_div_words)
        seq     t8,HH,t1
        sltu    AT,HH,t1
        and     t2,t8
        seq     t8,HH,t1
        sltu    AT,HH,t1
        and     t2,t8
+       sltu    v1,t0,a2
        or      AT,t2
        .set    noreorder
        beqz    AT,.L_bn_div_words_inner_loop2_done
        or      AT,t2
        .set    noreorder
        beqz    AT,.L_bn_div_words_inner_loop2_done
-       sltu    t2,t0,a2
-       .set    reorder
-       dsubu   QT,1
+       dsubu   t1,v1
        dsubu   t0,a2
        dsubu   t0,a2
-       dsubu   t1,t2
        b       .L_bn_div_words_inner_loop2
        b       .L_bn_div_words_inner_loop2
+       dsubu   QT,1
+       .set    reorder
 .L_bn_div_words_inner_loop2_done:      
 .L_bn_div_words_inner_loop2_done:      
+#undef HH
 
        dsubu   a0,t3,t0
        or      v0,QT
        dsrl    v1,a0,t9        /* v1 contains remainder if anybody wants it */
        dsrl    a2,t9           /* restore a2 */
        jr      ra
 
        dsubu   a0,t3,t0
        or      v0,QT
        dsrl    v1,a0,t9        /* v1 contains remainder if anybody wants it */
        dsrl    a2,t9           /* restore a2 */
        jr      ra
-#undef HH
-#undef DH
 #undef QT
 END(bn_div_words)
 
 #undef QT
 END(bn_div_words)
 
-.align 5
-LEAF(bn_div_3_words)
-       .set    reorder
-       move    a3,a0           /* we know that bn_div_words doesn't
-                                * touch a3, ta2, ta3 and preserves a2
-                                * so that we can save two arguments
-                                * and return address in registers
-                                * instead of stack:-)
-                                */
-       ld      a0,(a3)
-       move    ta2,a2
-       move    a2,a1
-       ld      a1,-8(a3)
-       move    ta3,ra
-       move    v1,zero
-       li      v0,-1
-       beq     a0,a2,.L_bn_div_3_words_skip_div
-       jal     bn_div_words
-       move    ra,ta3
-.L_bn_div_3_words_skip_div:
-       dmultu  ta2,v0
-       ld      t2,-16(a3)
-       mflo    t0
-       mfhi    t1
-.L_bn_div_3_words_inner_loop:
-       sgeu    AT,t2,t0
-       seq     t9,t1,v1
-       sltu    t8,t1,v1
-       and     AT,t9
-       or      AT,t8
-       bnez    AT,.L_bn_div_3_words_inner_loop_done
-       daddu   v1,a2
-       sltu    t3,t0,ta2
-       sltu    AT,v1,a2
-       dsubu   v0,1
-       dsubu   t0,ta2
-       dsubu   t1,t3
-       beqz    AT,.L_bn_div_3_words_inner_loop
-.L_bn_div_3_words_inner_loop_done:
-       jr      ra
-END(bn_div_3_words)
-
 #define        a_0     t0
 #define        a_1     t1
 #define        a_2     t2
 #define        a_0     t0
 #define        a_1     t1
 #define        a_2     t2
index 150dd28..6dd5d99 100644 (file)
@@ -202,7 +202,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
                {
                BN_ULONG q,l0;
 #ifdef BN_DIV3W
                {
                BN_ULONG q,l0;
 #ifdef BN_DIV3W
-               q=bn_div_3_words(wnump,d0,d1);
+               q=bn_div_3_words(wnump,d1,d0);
 #else
 
 #if !defined(NO_ASM) && !defined(PEDANTIC)
 #else
 
 #if !defined(NO_ASM) && !defined(PEDANTIC)
@@ -291,8 +291,8 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
 #endif
                }
 #endif /* !BN_DIV3W */
 #endif
                }
 #endif /* !BN_DIV3W */
-               wnum.d--; wnum.top++;
                l0=bn_mul_words(tmp->d,sdiv->d,div_n,q);
                l0=bn_mul_words(tmp->d,sdiv->d,div_n,q);
+               wnum.d--; wnum.top++;
                tmp->d[div_n]=l0;
                for (j=div_n+1; j>0; j--)
                        if (tmp->d[j-1]) break;
                tmp->d[div_n]=l0;
                for (j=div_n+1; j>0; j--)
                        if (tmp->d[j-1]) break;