2 .asciiz "mips3.s, Version 1.0"
3 .asciiz "MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
6 * ====================================================================
7 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
10 * Rights for redistribution and usage in source and binary forms are
11 * granted according to the OpenSSL license. Warranty of any kind is
13 * ====================================================================
17 * This is my modest contribution to the OpenSSL project (see
18 * http://www.openssl.org/ for more information about it) and is
19 * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c
20 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
22 * The module is designed to work with either of the "new" MIPS ABI(5),
23 * namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
24 * IRIX 5.x not only because it doesn't support new ABIs but also
25 * because 5.x kernels put R4x00 CPU into 32-bit mode and all those
26 * 64-bit instructions (daddu, dmultu, etc.) found below would only
27 * cause an illegal instruction exception:-(
29 * In addition the code depends on preprocessor flags set up by MIPSpro
30 * compiler driver (either as or cc) and therefore (probably?) can't be
31 * compiled by the GNU assembler. GNU C driver manages fine though...
32 * I mean as long as -mmips-as is specified or is the default option,
33 * because then it simply invokes /usr/bin/as which in turn takes
34 * perfect care of the preprocessor definitions. Another neat feature
35 * offered by the MIPSpro assembler is an optimization pass. This gave
36 * me the opportunity to have the code looking more regular as all those
37 * architecture dependent instruction rescheduling details were left to
38 * the assembler. Cool, huh?
40 * Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
41 * goes way over 3 times faster!
43 * <appro@fy.chalmers.se>
49 #define MOVNZ(cond,dst,src) \
52 #define MOVNZ(cond,dst,src) \
67 LEAF(bn_mul_add_words)
69 bgtzl a2,.L_bn_mul_add_words_proceed
75 .L_bn_mul_add_words_proceed:
79 beqz ta0,.L_bn_mul_add_words_tail
81 .L_bn_mul_add_words_loop:
89 sltu v0,t1,v0 /* All manuals say it "compares 32-bit
90 * values", but it seems to work fine
91 * even on 64-bit registers. */
140 bgtzl ta0,.L_bn_mul_add_words_loop
143 bnezl a2,.L_bn_mul_add_words_tail
147 .L_bn_mul_add_words_return:
150 .L_bn_mul_add_words_tail:
163 beqz a2,.L_bn_mul_add_words_return
178 beqz a2,.L_bn_mul_add_words_return
193 END(bn_mul_add_words)
198 bgtzl a2,.L_bn_mul_words_proceed
204 .L_bn_mul_words_proceed:
208 beqz ta0,.L_bn_mul_words_tail
210 .L_bn_mul_words_loop:
251 bgtzl ta0,.L_bn_mul_words_loop
254 bnezl a2,.L_bn_mul_words_tail
258 .L_bn_mul_words_return:
261 .L_bn_mul_words_tail:
270 beqz a2,.L_bn_mul_words_return
281 beqz a2,.L_bn_mul_words_return
297 bgtzl a2,.L_bn_sqr_words_proceed
303 .L_bn_sqr_words_proceed:
307 beqz ta0,.L_bn_sqr_words_tail
309 .L_bn_sqr_words_loop:
343 bgtzl ta0,.L_bn_sqr_words_loop
346 bnezl a2,.L_bn_sqr_words_tail
350 .L_bn_sqr_words_return:
354 .L_bn_sqr_words_tail:
361 beqz a2,.L_bn_sqr_words_return
370 beqz a2,.L_bn_sqr_words_return
384 bgtzl a3,.L_bn_add_words_proceed
390 .L_bn_add_words_proceed:
394 beqz AT,.L_bn_add_words_tail
396 .L_bn_add_words_loop:
438 bgtzl AT,.L_bn_add_words_loop
441 bnezl a3,.L_bn_add_words_tail
445 .L_bn_add_words_return:
448 .L_bn_add_words_tail:
457 beqz a3,.L_bn_add_words_return
468 beqz a3,.L_bn_add_words_return
484 bgtzl a3,.L_bn_sub_words_proceed
490 .L_bn_sub_words_proceed:
494 beqz AT,.L_bn_sub_words_tail
496 .L_bn_sub_words_loop:
535 bgtzl AT,.L_bn_sub_words_loop
538 bnezl a3,.L_bn_sub_words_tail
542 .L_bn_sub_words_return:
545 .L_bn_sub_words_tail:
553 beqz a3,.L_bn_sub_words_return
563 beqz a3,.L_bn_sub_words_return
580 move a3,a0 /* we know that bn_div_words doesn't
581 * touch a3, ta2, ta3 and preserves a2
582 * so that we can save two arguments
583 * and return address in registers
584 * instead of stack:-)
589 bne a0,a2,.L_bn_div_3_words_proceed
592 .L_bn_div_3_words_proceed:
602 .L_bn_div_3_words_inner_loop:
603 bnez t8,.L_bn_div_3_words_inner_loop_done
615 beqzl AT,.L_bn_div_3_words_inner_loop
618 .L_bn_div_3_words_inner_loop_done:
625 bnezl a2,.L_bn_div_words_proceed
628 li v0,-1 /* I'd rather signal div-by-zero
629 * which can be done with 'break 7' */
631 .L_bn_div_words_proceed:
632 bltz a2,.L_bn_div_words_body
646 break 6 /* signal overflow */
655 .L_bn_div_words_body:
665 dsrl QT,32 /* q=0xffffffff */
666 beq DH,HH,.L_bn_div_words_skip_div1
669 .L_bn_div_words_skip_div1:
676 .L_bn_div_words_inner_loop1:
684 beqz AT,.L_bn_div_words_inner_loop1_done
687 b .L_bn_div_words_inner_loop1
690 .L_bn_div_words_inner_loop1_done:
698 dsrl QT,32 /* q=0xffffffff */
699 beq DH,HH,.L_bn_div_words_skip_div2
702 .L_bn_div_words_skip_div2:
710 .L_bn_div_words_inner_loop2:
718 beqz AT,.L_bn_div_words_inner_loop2_done
721 b .L_bn_div_words_inner_loop2
724 .L_bn_div_words_inner_loop2_done:
729 dsrl v1,a0,t9 /* v1 contains remainder if anybody wants it */
730 dsrl a2,t9 /* restore a2 */
747 #define a_7 a1 /* once we load a[7] we don't need a anymore */
751 #define b_7 a2 /* once we load b[7] we don't need b anymore */
760 #define FRAME_SIZE 48
765 PTR_SUB sp,FRAME_SIZE
768 ld a_0,0(a1) /* If compiled with -mips3 option on
769 * R5000 box assembler barks on this
770 * line with "shouldn't have mult/div
771 * as last instruction in bb (R10K
772 * bug)" warning. If anybody out there
773 * has a clue about how to circumvent
774 * this do send me a note.
775 * <appro@fy.chalmers.se>
784 dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
794 dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */
806 dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */
809 sd c_1,0(a0) /* r[0]=c1; */
817 sd c_2,8(a0) /* r[1]=c2; */
819 dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */
826 dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
834 dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */
843 sd c_3,16(a0) /* r[2]=c3; */
845 dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */
852 dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */
860 dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */
869 dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */
878 sd c_1,24(a0) /* r[3]=c1; */
880 dmultu a_4,b_0 /* mul_add_c(a[4],b[0],c2,c3,c1); */
887 dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */
895 dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
904 dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */
913 dmultu a_0,b_4 /* mul_add_c(a[0],b[4],c2,c3,c1); */
922 sd c_2,32(a0) /* r[4]=c2; */
924 dmultu a_0,b_5 /* mul_add_c(a[0],b[5],c3,c1,c2); */
931 dmultu a_1,b_4 /* mul_add_c(a[1],b[4],c3,c1,c2); */
939 dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */
948 dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */
957 dmultu a_4,b_1 /* mul_add_c(a[4],b[1],c3,c1,c2); */
966 dmultu a_5,b_0 /* mul_add_c(a[5],b[0],c3,c1,c2); */
975 sd c_3,40(a0) /* r[5]=c3; */
977 dmultu a_6,b_0 /* mul_add_c(a[6],b[0],c1,c2,c3); */
984 dmultu a_5,b_1 /* mul_add_c(a[5],b[1],c1,c2,c3); */
992 dmultu a_4,b_2 /* mul_add_c(a[4],b[2],c1,c2,c3); */
1001 dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
1010 dmultu a_2,b_4 /* mul_add_c(a[2],b[4],c1,c2,c3); */
1019 dmultu a_1,b_5 /* mul_add_c(a[1],b[5],c1,c2,c3); */
1028 dmultu a_0,b_6 /* mul_add_c(a[0],b[6],c1,c2,c3); */
1037 sd c_1,48(a0) /* r[6]=c1; */
1039 dmultu a_0,b_7 /* mul_add_c(a[0],b[7],c2,c3,c1); */
1046 dmultu a_1,b_6 /* mul_add_c(a[1],b[6],c2,c3,c1); */
1054 dmultu a_2,b_5 /* mul_add_c(a[2],b[5],c2,c3,c1); */
1063 dmultu a_3,b_4 /* mul_add_c(a[3],b[4],c2,c3,c1); */
1072 dmultu a_4,b_3 /* mul_add_c(a[4],b[3],c2,c3,c1); */
1081 dmultu a_5,b_2 /* mul_add_c(a[5],b[2],c2,c3,c1); */
1090 dmultu a_6,b_1 /* mul_add_c(a[6],b[1],c2,c3,c1); */
1099 dmultu a_7,b_0 /* mul_add_c(a[7],b[0],c2,c3,c1); */
1108 sd c_2,56(a0) /* r[7]=c2; */
1110 dmultu a_7,b_1 /* mul_add_c(a[7],b[1],c3,c1,c2); */
1117 dmultu a_6,b_2 /* mul_add_c(a[6],b[2],c3,c1,c2); */
1125 dmultu a_5,b_3 /* mul_add_c(a[5],b[3],c3,c1,c2); */
1134 dmultu a_4,b_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */
1143 dmultu a_3,b_5 /* mul_add_c(a[3],b[5],c3,c1,c2); */
1152 dmultu a_2,b_6 /* mul_add_c(a[2],b[6],c3,c1,c2); */
1161 dmultu a_1,b_7 /* mul_add_c(a[1],b[7],c3,c1,c2); */
1170 sd c_3,64(a0) /* r[8]=c3; */
1172 dmultu a_2,b_7 /* mul_add_c(a[2],b[7],c1,c2,c3); */
1179 dmultu a_3,b_6 /* mul_add_c(a[3],b[6],c1,c2,c3); */
1187 dmultu a_4,b_5 /* mul_add_c(a[4],b[5],c1,c2,c3); */
1196 dmultu a_5,b_4 /* mul_add_c(a[5],b[4],c1,c2,c3); */
1205 dmultu a_6,b_3 /* mul_add_c(a[6],b[3],c1,c2,c3); */
1214 dmultu a_7,b_2 /* mul_add_c(a[7],b[2],c1,c2,c3); */
1223 sd c_1,72(a0) /* r[9]=c1; */
1225 dmultu a_7,b_3 /* mul_add_c(a[7],b[3],c2,c3,c1); */
1232 dmultu a_6,b_4 /* mul_add_c(a[6],b[4],c2,c3,c1); */
1240 dmultu a_5,b_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */
1249 dmultu a_4,b_6 /* mul_add_c(a[4],b[6],c2,c3,c1); */
1258 dmultu a_3,b_7 /* mul_add_c(a[3],b[7],c2,c3,c1); */
1267 sd c_2,80(a0) /* r[10]=c2; */
1269 dmultu a_4,b_7 /* mul_add_c(a[4],b[7],c3,c1,c2); */
1276 dmultu a_5,b_6 /* mul_add_c(a[5],b[6],c3,c1,c2); */
1284 dmultu a_6,b_5 /* mul_add_c(a[6],b[5],c3,c1,c2); */
1293 dmultu a_7,b_4 /* mul_add_c(a[7],b[4],c3,c1,c2); */
1302 sd c_3,88(a0) /* r[11]=c3; */
1304 dmultu a_7,b_5 /* mul_add_c(a[7],b[5],c1,c2,c3); */
1311 dmultu a_6,b_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */
1319 dmultu a_5,b_7 /* mul_add_c(a[5],b[7],c1,c2,c3); */
1328 sd c_1,96(a0) /* r[12]=c1; */
1330 dmultu a_6,b_7 /* mul_add_c(a[6],b[7],c2,c3,c1); */
1337 dmultu a_7,b_6 /* mul_add_c(a[7],b[6],c2,c3,c1); */
1345 sd c_2,104(a0) /* r[13]=c2; */
1347 dmultu a_7,b_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */
1360 sd c_3,112(a0) /* r[14]=c3; */
1361 sd c_1,120(a0) /* r[15]=c1; */
1363 PTR_ADD sp,FRAME_SIZE
1375 dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
1384 dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */
1390 dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */
1400 dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */
1407 dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
1415 dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */
1426 dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */
1433 dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */
1441 dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */
1450 dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */
1461 dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */
1468 dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
1476 dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */
1487 dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */
1494 dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */
1504 dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
1534 dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
1543 dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */
1556 dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */
1568 dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
1579 dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */
1591 dmultu a_1,a_2 /* mul_add_c2(a[1],b[2],c1,c2,c3); */
1608 dmultu a_4,a_0 /* mul_add_c2(a[4],b[0],c2,c3,c1); */
1620 dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */
1635 dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
1646 dmultu a_0,a_5 /* mul_add_c2(a[0],b[5],c3,c1,c2); */
1658 dmultu a_1,a_4 /* mul_add_c2(a[1],b[4],c3,c1,c2); */
1673 dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */
1690 dmultu a_6,a_0 /* mul_add_c2(a[6],b[0],c1,c2,c3); */
1702 dmultu a_5,a_1 /* mul_add_c2(a[5],b[1],c1,c2,c3); */
1717 dmultu a_4,a_2 /* mul_add_c2(a[4],b[2],c1,c2,c3); */
1732 dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
1743 dmultu a_0,a_7 /* mul_add_c2(a[0],b[7],c2,c3,c1); */
1755 dmultu a_1,a_6 /* mul_add_c2(a[1],b[6],c2,c3,c1); */
1770 dmultu a_2,a_5 /* mul_add_c2(a[2],b[5],c2,c3,c1); */
1785 dmultu a_3,a_4 /* mul_add_c2(a[3],b[4],c2,c3,c1); */
1802 dmultu a_7,a_1 /* mul_add_c2(a[7],b[1],c3,c1,c2); */
1814 dmultu a_6,a_2 /* mul_add_c2(a[6],b[2],c3,c1,c2); */
1829 dmultu a_5,a_3 /* mul_add_c2(a[5],b[3],c3,c1,c2); */
1844 dmultu a_4,a_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */
1855 dmultu a_2,a_7 /* mul_add_c2(a[2],b[7],c1,c2,c3); */
1867 dmultu a_3,a_6 /* mul_add_c2(a[3],b[6],c1,c2,c3); */
1882 dmultu a_4,a_5 /* mul_add_c2(a[4],b[5],c1,c2,c3); */
1899 dmultu a_7,a_3 /* mul_add_c2(a[7],b[3],c2,c3,c1); */
1911 dmultu a_6,a_4 /* mul_add_c2(a[6],b[4],c2,c3,c1); */
1926 dmultu a_5,a_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */
1937 dmultu a_4,a_7 /* mul_add_c2(a[4],b[7],c3,c1,c2); */
1949 dmultu a_5,a_6 /* mul_add_c2(a[5],b[6],c3,c1,c2); */
1966 dmultu a_7,a_5 /* mul_add_c2(a[7],b[5],c1,c2,c3); */
1978 dmultu a_6,a_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */
1989 dmultu a_6,a_7 /* mul_add_c2(a[6],b[7],c2,c3,c1); */
2003 dmultu a_7,a_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */
2023 dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
2028 dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */
2041 dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */
2053 dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
2064 dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */
2076 dmultu a_1,a_2 /* mul_add_c2(a[1],b[2],c1,c2,c3); */
2093 dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */
2105 dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
2116 dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */
2130 dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */