2 .asciiz "mips3.s, Version 1.0 (prerelease)"
3 .asciiz "MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
6 * ====================================================================
7 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
10 * Rights for redistribution and usage in source and binary forms are
11 * granted according to the OpenSSL license. Warranty of any kind is
13 * ====================================================================
17 * This is my modest contribution to the OpenSSL project (see
18 * http://www.openssl.org/ for more information about it) and is
19 * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c
20 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
22 * The module is designed to work with "new" IRIX ABI(5), namely
23 * N32 and N64. But it was tested only with MIPSpro 7.2.x assembler,
24 * i.e. depends on preprocessor options set up by MIPSpro 7.2.x
25 * driver. Another neat gadget offered by MIPSpro 7.2.x assembler is
26 * a peep-hole(?) optimization pass. This gave me the opportunity
27 * to make the code look more regular as all those architecture
28 * dependent(!) instruction rescheduling details were left to the
29 * assembler. Cool, huh? Do note that I have no idea if GNU assembler
30 * does anything similar nor how GNU C will do with this module.
31 * Feedback on the matter is therefore very much appreciated:-)
33 * Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
34 * exhibits 3-3.5-3.7 times improvement!
36 * <appro@fy.chalmers.se>
42 #define MOVNZ(cond,dst,src) \
45 #define MOVNZ(cond,dst,src) \
/*
 * bn_mul_add_words
 *
 * Only the labels and branches of this routine appear in this listing; the
 * arithmetic between them is not shown.  The visible control flow is:
 *   - a guard that branches to ..._proceed when a2 is greater than zero;
 *   - a main loop (..._loop) that is bypassed when ta0 is zero and is
 *     repeated while ta0 is greater than zero;
 *   - a tail (..._tail), reached either from that bypass or when a2 is
 *     non-zero after the loop, containing two checks that branch to
 *     ..._return when a2 is zero.
 * NOTE(review): a2 presumably holds the (remaining) word count and ta0 the
 * part of it handled by the main loop -- confirm against the full source.
 */
59 LEAF(bn_mul_add_words)
/* Guard: taken when a2 > 0. */
62 bgtzl a2,.L_bn_mul_add_words_proceed
68 .L_bn_mul_add_words_proceed:
/* ta0 == 0: bypass the main loop and go straight to the tail. */
72 beqz ta0,.L_bn_mul_add_words_tail
74 .L_bn_mul_add_words_loop:
/* v0 = (t1 < v0), unsigned.  NOTE(review): presumably carry detection
 * after an addition that is not visible here -- confirm. */
82 sltu v0,t1,v0 /* All manuals say it "compares 32-bit
83 * values", but it seems to work fine
84 * even on 64-bit registers. */
/* Loop back while ta0 > 0. */
133 bgtzl ta0,.L_bn_mul_add_words_loop
/* a2 != 0 after the main loop: continue in the tail. */
136 bnezl a2,.L_bn_mul_add_words_tail
140 .L_bn_mul_add_words_return:
143 .L_bn_mul_add_words_tail:
/* Each of the two checks below leaves the tail when a2 == 0. */
156 beqz a2,.L_bn_mul_add_words_return
171 beqz a2,.L_bn_mul_add_words_return
186 END(bn_mul_add_words)
191 bgtzl a2,.L_bn_mul_words_proceed
197 .L_bn_mul_words_proceed:
201 beqz ta0,.L_bn_mul_words_tail
203 .L_bn_mul_words_loop:
244 bgtzl ta0,.L_bn_mul_words_loop
247 bnezl a2,.L_bn_mul_words_tail
251 .L_bn_mul_words_return:
254 .L_bn_mul_words_tail:
263 beqz a2,.L_bn_mul_words_return
274 beqz a2,.L_bn_mul_words_return
290 bgtzl a2,.L_bn_sqr_words_proceed
296 .L_bn_sqr_words_proceed:
300 beqz ta0,.L_bn_sqr_words_tail
302 .L_bn_sqr_words_loop:
336 bgtzl ta0,.L_bn_sqr_words_loop
339 bnezl a2,.L_bn_sqr_words_tail
343 .L_bn_sqr_words_return:
347 .L_bn_sqr_words_tail:
354 beqz a2,.L_bn_sqr_words_return
363 beqz a2,.L_bn_sqr_words_return
377 bgtzl a3,.L_bn_add_words_proceed
383 .L_bn_add_words_proceed:
387 beqz AT,.L_bn_add_words_tail
389 .L_bn_add_words_loop:
431 bgtzl AT,.L_bn_add_words_loop
434 bnezl a3,.L_bn_add_words_tail
438 .L_bn_add_words_return:
441 .L_bn_add_words_tail:
450 beqz a3,.L_bn_add_words_return
461 beqz a3,.L_bn_add_words_return
477 bgtzl a3,.L_bn_sub_words_proceed
483 .L_bn_sub_words_proceed:
487 beqz AT,.L_bn_sub_words_tail
489 .L_bn_sub_words_loop:
528 bgtzl AT,.L_bn_sub_words_loop
531 bnezl a3,.L_bn_sub_words_tail
535 .L_bn_sub_words_return:
538 .L_bn_sub_words_tail:
546 beqz a3,.L_bn_sub_words_return
556 beqz a3,.L_bn_sub_words_return
573 bnezl a2,.L_bn_div_words_proceed
576 li v0,-1 /* I'd rather signal div-by-zero
577 * which can be done with 'break 7' */
580 .L_bn_div_words_proceed:
581 bltz a2,.L_bn_div_words_body
594 break 6 /* signal overflow */
604 .L_bn_div_words_body:
614 .L_bn_div_words_outer_loop:
617 dsrl QT,MINUS1,32 /* q=0xffffffff */
618 beq DH,HH,.L_bn_div_words_inner_loop
621 .L_bn_div_words_inner_loop:
634 bnezl AT,.L_bn_div_words_inner_loop
639 beqz v1,.L_bn_div_words_outer_loop_done
643 b .L_bn_div_words_outer_loop
645 .L_bn_div_words_outer_loop_done:
647 move v1,a0 /* v1 contains remainder if one wants it */
667 #define a_7 a1 /* once we load a[7] we don't need a anymore */
671 #define b_7 a2 /* once we load b[7] we don't need b anymore */
680 #define FRAME_SIZE 48
685 PTR_SUB sp,FRAME_SIZE
688 ld a_0,0(a1) /* If compiled with -mips3 options
689 * assembler barks on this line with
690 * "shouldn't have mult/div as last
691 * instruction in bb (R10K bug)"
692 * warning. If anybody out there has
693 * a clue on what "bb" means and
694 * how to circumvent this do send me
696 * <appro@fy.chalmers.se>
705 dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
715 dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */
727 dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */
730 sd c_1,0(a0) /* r[0]=c1; */
738 sd c_2,8(a0) /* r[1]=c2; */
740 dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */
747 dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
755 dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */
764 sd c_3,16(a0) /* r[2]=c3; */
766 dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */
773 dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */
781 dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */
790 dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */
799 sd c_1,24(a0) /* r[3]=c1; */
801 dmultu a_4,b_0 /* mul_add_c(a[4],b[0],c2,c3,c1); */
808 dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */
816 dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
825 dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */
834 dmultu a_0,b_4 /* mul_add_c(a[0],b[4],c2,c3,c1); */
843 sd c_2,32(a0) /* r[4]=c2; */
845 dmultu a_0,b_5 /* mul_add_c(a[0],b[5],c3,c1,c2); */
852 dmultu a_1,b_4 /* mul_add_c(a[1],b[4],c3,c1,c2); */
860 dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */
869 dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */
878 dmultu a_4,b_1 /* mul_add_c(a[4],b[1],c3,c1,c2); */
887 dmultu a_5,b_0 /* mul_add_c(a[5],b[0],c3,c1,c2); */
896 sd c_3,40(a0) /* r[5]=c3; */
898 dmultu a_6,b_0 /* mul_add_c(a[6],b[0],c1,c2,c3); */
905 dmultu a_5,b_1 /* mul_add_c(a[5],b[1],c1,c2,c3); */
913 dmultu a_4,b_2 /* mul_add_c(a[4],b[2],c1,c2,c3); */
922 dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
931 dmultu a_2,b_4 /* mul_add_c(a[2],b[4],c1,c2,c3); */
940 dmultu a_1,b_5 /* mul_add_c(a[1],b[5],c1,c2,c3); */
949 dmultu a_0,b_6 /* mul_add_c(a[0],b[6],c1,c2,c3); */
958 sd c_1,48(a0) /* r[6]=c1; */
960 dmultu a_0,b_7 /* mul_add_c(a[0],b[7],c2,c3,c1); */
967 dmultu a_1,b_6 /* mul_add_c(a[1],b[6],c2,c3,c1); */
975 dmultu a_2,b_5 /* mul_add_c(a[2],b[5],c2,c3,c1); */
984 dmultu a_3,b_4 /* mul_add_c(a[3],b[4],c2,c3,c1); */
993 dmultu a_4,b_3 /* mul_add_c(a[4],b[3],c2,c3,c1); */
1002 dmultu a_5,b_2 /* mul_add_c(a[5],b[2],c2,c3,c1); */
1011 dmultu a_6,b_1 /* mul_add_c(a[6],b[1],c2,c3,c1); */
1020 dmultu a_7,b_0 /* mul_add_c(a[7],b[0],c2,c3,c1); */
1029 sd c_2,56(a0) /* r[7]=c2; */
1031 dmultu a_7,b_1 /* mul_add_c(a[7],b[1],c3,c1,c2); */
1038 dmultu a_6,b_2 /* mul_add_c(a[6],b[2],c3,c1,c2); */
1046 dmultu a_5,b_3 /* mul_add_c(a[5],b[3],c3,c1,c2); */
1055 dmultu a_4,b_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */
1064 dmultu a_3,b_5 /* mul_add_c(a[3],b[5],c3,c1,c2); */
1073 dmultu a_2,b_6 /* mul_add_c(a[2],b[6],c3,c1,c2); */
1082 dmultu a_1,b_7 /* mul_add_c(a[1],b[7],c3,c1,c2); */
1091 sd c_3,64(a0) /* r[8]=c3; */
1093 dmultu a_2,b_7 /* mul_add_c(a[2],b[7],c1,c2,c3); */
1100 dmultu a_3,b_6 /* mul_add_c(a[3],b[6],c1,c2,c3); */
1108 dmultu a_4,b_5 /* mul_add_c(a[4],b[5],c1,c2,c3); */
1117 dmultu a_5,b_4 /* mul_add_c(a[5],b[4],c1,c2,c3); */
1126 dmultu a_6,b_3 /* mul_add_c(a[6],b[3],c1,c2,c3); */
1135 dmultu a_7,b_2 /* mul_add_c(a[7],b[2],c1,c2,c3); */
1144 sd c_1,72(a0) /* r[9]=c1; */
1146 dmultu a_7,b_3 /* mul_add_c(a[7],b[3],c2,c3,c1); */
1153 dmultu a_6,b_4 /* mul_add_c(a[6],b[4],c2,c3,c1); */
1161 dmultu a_5,b_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */
1170 dmultu a_4,b_6 /* mul_add_c(a[4],b[6],c2,c3,c1); */
1179 dmultu a_3,b_7 /* mul_add_c(a[3],b[7],c2,c3,c1); */
1188 sd c_2,80(a0) /* r[10]=c2; */
1190 dmultu a_4,b_7 /* mul_add_c(a[4],b[7],c3,c1,c2); */
1197 dmultu a_5,b_6 /* mul_add_c(a[5],b[6],c3,c1,c2); */
1205 dmultu a_6,b_5 /* mul_add_c(a[6],b[5],c3,c1,c2); */
1214 dmultu a_7,b_4 /* mul_add_c(a[7],b[4],c3,c1,c2); */
1223 sd c_3,88(a0) /* r[11]=c3; */
1225 dmultu a_7,b_5 /* mul_add_c(a[7],b[5],c1,c2,c3); */
1232 dmultu a_6,b_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */
1240 dmultu a_5,b_7 /* mul_add_c(a[5],b[7],c1,c2,c3); */
1249 sd c_1,96(a0) /* r[12]=c1; */
1251 dmultu a_6,b_7 /* mul_add_c(a[6],b[7],c2,c3,c1); */
1258 dmultu a_7,b_6 /* mul_add_c(a[7],b[6],c2,c3,c1); */
1266 sd c_2,104(a0) /* r[13]=c2; */
1268 dmultu a_7,b_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */
1281 sd c_3,112(a0) /* r[14]=c3; */
1282 sd c_1,120(a0) /* r[15]=c1; */
1284 PTR_ADD sp,FRAME_SIZE
1296 dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
1305 dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */
1311 dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */
1321 dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */
1328 dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
1336 dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */
1347 dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */
1354 dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */
1362 dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */
1371 dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */
1382 dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */
1389 dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
1397 dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */
1408 dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */
1415 dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */
1425 dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
1455 dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
1464 dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */
1477 dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */
1489 dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
1500 dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */
1512 dmultu a_1,a_2 /* mul_add_c2(a[1],b[2],c1,c2,c3); */
1529 dmultu a_4,a_0 /* mul_add_c2(a[4],b[0],c2,c3,c1); */
1541 dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */
1556 dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
1567 dmultu a_0,a_5 /* mul_add_c2(a[0],b[5],c3,c1,c2); */
1579 dmultu a_1,a_4 /* mul_add_c2(a[1],b[4],c3,c1,c2); */
1594 dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */
1611 dmultu a_6,a_0 /* mul_add_c2(a[6],b[0],c1,c2,c3); */
1623 dmultu a_5,a_1 /* mul_add_c2(a[5],b[1],c1,c2,c3); */
1638 dmultu a_4,a_2 /* mul_add_c2(a[4],b[2],c1,c2,c3); */
1653 dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
1664 dmultu a_0,a_7 /* mul_add_c2(a[0],b[7],c2,c3,c1); */
1676 dmultu a_1,a_6 /* mul_add_c2(a[1],b[6],c2,c3,c1); */
1691 dmultu a_2,a_5 /* mul_add_c2(a[2],b[5],c2,c3,c1); */
1706 dmultu a_3,a_4 /* mul_add_c2(a[3],b[4],c2,c3,c1); */
1723 dmultu a_7,a_1 /* mul_add_c2(a[7],b[1],c3,c1,c2); */
1735 dmultu a_6,a_2 /* mul_add_c2(a[6],b[2],c3,c1,c2); */
1750 dmultu a_5,a_3 /* mul_add_c2(a[5],b[3],c3,c1,c2); */
1765 dmultu a_4,a_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */
1776 dmultu a_2,a_7 /* mul_add_c2(a[2],b[7],c1,c2,c3); */
1788 dmultu a_3,a_6 /* mul_add_c2(a[3],b[6],c1,c2,c3); */
1803 dmultu a_4,a_5 /* mul_add_c2(a[4],b[5],c1,c2,c3); */
1820 dmultu a_7,a_3 /* mul_add_c2(a[7],b[3],c2,c3,c1); */
1832 dmultu a_6,a_4 /* mul_add_c2(a[6],b[4],c2,c3,c1); */
1847 dmultu a_5,a_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */
1858 dmultu a_4,a_7 /* mul_add_c2(a[4],b[7],c3,c1,c2); */
1870 dmultu a_5,a_6 /* mul_add_c2(a[5],b[6],c3,c1,c2); */
1887 dmultu a_7,a_5 /* mul_add_c2(a[7],b[5],c1,c2,c3); */
1899 dmultu a_6,a_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */
1910 dmultu a_6,a_7 /* mul_add_c2(a[6],b[7],c2,c3,c1); */
1924 dmultu a_7,a_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */
1944 dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
1949 dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */
1962 dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */
1974 dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
1985 dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */
1997 dmultu a_1,a_2 /* mul_add_c2(a[1],b[2],c1,c2,c3); */
2014 dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */
2026 dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
2037 dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */
2051 dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */