;;====================================================================
;; Written by Andy Polyakov for the OpenSSL
;; project.
;;
;; Rights for redistribution and usage in source and binary forms are
;; granted according to the OpenSSL license. Warranty of any kind is
;; disclaimed.
;;====================================================================
;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
;;====================================================================
        .text

        ;; Under EABI the plain names are substituted by the
        ;; underscore-prefixed labels defined in this file.
        .if     __TI_EABI__
        .asg    bn_mul_add_words,_bn_mul_add_words
        .asg    bn_mul_words,_bn_mul_words
        .asg    bn_sqr_words,_bn_sqr_words
        .asg    bn_add_words,_bn_add_words
        .asg    bn_sub_words,_bn_sub_words
        .asg    bn_div_words,_bn_div_words
        .asg    bn_sqr_comba8,_bn_sqr_comba8
        .asg    bn_mul_comba8,_bn_mul_comba8
        .asg    bn_sqr_comba4,_bn_sqr_comba4
        .asg    bn_mul_comba4,_bn_mul_comba4
        .endif

        ;; Symbolic names for the registers used as return address,
        ;; arguments, return value and frame/data/stack pointers.
        .asg    B3,RA
        .asg    A4,ARG0
        .asg    B4,ARG1
        .asg    A6,ARG2
        .asg    B6,ARG3
        .asg    A8,ARG4
        .asg    B8,ARG5
        .asg    A4,RET
        .asg    A15,FP
        .asg    B14,DP
        .asg    B15,SP

;;--------------------------------------------------------------------
;; _bn_mul_add_words: rp[i] += ap[i]*w with carry propagation.
;; In:   ARG0 = rp, ARG1 = ap, ARG2 = word count, ARG3 = multiplier
;; Out:  RET  = final high word of the accumulator (carry-out);
;;       0 is returned immediately when the word count is zero.
;;--------------------------------------------------------------------
        .global _bn_mul_add_words
_bn_mul_add_words:
        .asmfunc
        MV      ARG2,B0                 ; B0 = word count, also predicate
  [!B0] BNOP    RA                      ; nothing to do: return early
||[!B0] MVK     0,RET
   [B0] MVC     B0,ILC                  ; inner loop count = word count
   [B0] ZERO    A19                     ; high part of accumulator
|| [B0] MV      ARG0,A2                 ; A2 = store pointer into rp
|| [B0] MV      ARG3,A3                 ; A3 = multiplier
        NOP     3

        SPLOOP  2                       ; 2*n+10
;;====================================================================
        LDW     *ARG1++,B7              ; ap[i]
        NOP     3
        LDW     *ARG0++,A7              ; rp[i]
        MPY32U  B7,A3,A17:A16
        NOP     3                       ; [2,0] in epilogue
        ADDU    A16,A7,A21:A20
        ADDU    A19,A21:A20,A19:A18
||      MV.S    A17,A23
        SPKERNEL 2,1                    ; leave slot for "return value"
||      STW     A18,*A2++               ; rp[i]
||      ADD     A19,A23,A19
;;====================================================================
        BNOP    RA,4
        MV      A19,RET                 ; return value
        .endasmfunc

;;--------------------------------------------------------------------
;; _bn_mul_words: rp[i] = ap[i]*w with carry propagation.
;; In:   ARG0 = rp, ARG1 = ap, ARG2 = word count, ARG3 = multiplier
;; Out:  RET  = final high word of the accumulator (carry-out);
;;       0 is returned immediately when the word count is zero.
;;--------------------------------------------------------------------
        .global _bn_mul_words
_bn_mul_words:
        .asmfunc
        MV      ARG2,B0                 ; B0 = word count, also predicate
  [!B0] BNOP    RA                      ; nothing to do: return early
||[!B0] MVK     0,RET
   [B0] MVC     B0,ILC                  ; inner loop count = word count
   [B0] ZERO    A19                     ; high part of accumulator
        NOP     3

        SPLOOP  2                       ; 2*n+10
;;====================================================================
        LDW     *ARG1++,A7              ; ap[i]
        NOP     4
        MPY32U  A7,ARG3,A17:A16
        NOP     4                       ; [2,0] in epilogue
        ADDU    A19,A16,A19:A18
||      MV.S    A17,A21
        SPKERNEL 2,1                    ; leave slot for "return value"
||      STW     A18,*ARG0++             ; rp[i]
||      ADD.L   A19,A21,A19
;;====================================================================
        BNOP    RA,4
        MV      A19,RET                 ; return value
        .endasmfunc

;;--------------------------------------------------------------------
;; _bn_sqr_words: rp[2*i], rp[2*i+1] = low, high word of ap[i]^2.
;; In:   ARG0 = rp, ARG1 = ap, ARG2 = word count
;; Out:  RET = 0 when the word count is zero; the non-empty path does
;;       not set RET explicitly.
;;--------------------------------------------------------------------
        .global _bn_sqr_words
_bn_sqr_words:
        .asmfunc
        MV      ARG2,B0                 ; B0 = word count, also predicate
  [!B0] BNOP    RA                      ; nothing to do: return early
||[!B0] MVK     0,RET
   [B0] MVC     B0,ILC                  ; inner loop count = word count
   [B0] MV      ARG0,B2                 ; B2   -> even words of rp
|| [B0] ADD     4,ARG0,ARG0             ; ARG0 -> odd words of rp
        NOP     3

        SPLOOP  2                       ; 2*n+10
;;====================================================================
        LDW     *ARG1++,B7              ; ap[i]
        NOP     4
        MPY32U  B7,B7,B1:B0
        NOP     3                       ; [2,0] in epilogue
        STW     B0,*B2++(8)             ; rp[2*i]
        MV      B1,A1
        SPKERNEL 2,0                    ; fully overlap BNOP RA,5
||      STW     A1,*ARG0++(8)           ; rp[2*i+1]
;;====================================================================
        BNOP    RA,5
        .endasmfunc

;;--------------------------------------------------------------------
;; _bn_add_words: rp[i] = ap[i] + bp[i] + carry.
;; In:   ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = word count
;; Out:  RET  = carry out of the last word; 0 is returned immediately
;;       when the word count is zero.
;;--------------------------------------------------------------------
        .global _bn_add_words
_bn_add_words:
        .asmfunc
        MV      ARG3,B0                 ; B0 = word count, also predicate
  [!B0] BNOP    RA                      ; nothing to do: return early
||[!B0] MVK     0,RET
   [B0] MVC     B0,ILC                  ; inner loop count = word count
   [B0] ZERO    A1                      ; carry flag
|| [B0] MV      ARG0,A3                 ; A3 = store pointer into rp
        NOP     3

        SPLOOP  2                       ; 2*n+6
;;====================================================================
        LDW     *ARG2++,A7              ; bp[i]
||      LDW     *ARG1++,B7              ; ap[i]
        NOP     4
        ADDU    A7,B7,A9:A8
        ADDU    A1,A9:A8,A1:A0
        SPKERNEL 0,0                    ; fully overlap BNOP RA,5
||      STW     A0,*A3++                ; write result
||      MV      A1,RET                  ; keep carry flag in RET
;;====================================================================
        BNOP    RA,5
        .endasmfunc

;;--------------------------------------------------------------------
;; _bn_sub_words: rp[i] = ap[i] - bp[i] - borrow.
;; In:   ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = word count
;; Out:  RET  = borrow out of the last word (0 or 1); 0 is returned
;;       immediately when the word count is zero.
;;--------------------------------------------------------------------
        .global _bn_sub_words
_bn_sub_words:
        .asmfunc
        MV      ARG3,B0                 ; B0 = word count, also predicate
  [!B0] BNOP    RA                      ; nothing to do: return early
||[!B0] MVK     0,RET
   [B0] MVC     B0,ILC                  ; inner loop count = word count
   [B0] ZERO    A2                      ; borrow flag
|| [B0] MV      ARG0,A3                 ; A3 = store pointer into rp
        NOP     3

        SPLOOP  2                       ; 2*n+6
;;====================================================================
        LDW     *ARG2++,A7              ; bp[i]
||      LDW     *ARG1++,B7              ; ap[i]
        NOP     4
        SUBU    B7,A7,A1:A0
   [A2] SUB     A1:A0,1,A1:A0           ; apply borrow from previous word
        SPKERNEL 0,1                    ; leave slot for "return borrow flag"
||      STW     A0,*A3++                ; write result
||      AND     1,A1,A2                 ; pass on borrow flag
;;====================================================================
        BNOP    RA,4
        AND     1,A1,RET                ; return borrow flag
        .endasmfunc

;;--------------------------------------------------------------------
;; _bn_div_words: bit-serial shift-subtract division of hi:lo by dv.
;; In:   A4 = hi, B4 = lo, A6 = dv
;; Out:  A4 = quotient, or -1 when hi has fewer leading zero bits
;;       than dv (quotient would not fit in one word).
;;
;; dv is normalized (shifted left by its leading-zero count, A0) and
;; 33+A0 compare/subtract/shift steps are executed: one ahead of the
;; SPLOOP and 32+A0 (ILC) inside it. A1 doubles as "skip subtraction"
;; predicate: it is the hi<dv compare result, or, when the bit shifted
;; out of hi in the previous step was set, it is cleared so that the
;; subtraction is forced. The five execute packets following the
;; overflow BNOP fill its delay slots.
;; NOTE(review): the pre-loop step and the SPLOOP body follow the
;; upstream OpenSSL implementation; verify against it.
;;--------------------------------------------------------------------
        .global _bn_div_words
_bn_div_words:
        .asmfunc
        LMBD    1,A6,A0                 ; leading zero bits in dv
        LMBD    1,A4,A1                 ; leading zero bits in hi
||      MVK     32,B0
        CMPLTU  A1,A0,A2                ; A2 = overflow condition
||      ADD     A0,B0,B0                ; B0 = 32 + normalization shift
  [ A2] BNOP    RA
||[ A2] MVK     -1,A4                   ; return overflow
||[!A2] MV      A4,A3                   ; reassign hi
  [!A2] MV      B4,A4                   ; reassign lo, will be quotient
||[!A2] MVC     B0,ILC
  [!A2] SHL     A6,A0,A6                ; normalize dv
||      MVK     1,A1                    ; default: skip subtraction

        ;; first step, executed ahead of the loop
  [!A2] CMPLTU  A3,A6,A1                ; hi < dv ?
||[!A2] SHL     A4,1,A5:A4              ; lo << 1, shifted-out bit in A5
  [!A1] SUB     A3,A6,A3                ; hi -= dv
||[!A1] OR      1,A4,A4                 ; quotient bit
  [!A2] SHRU    A3,31,A1                ; upper bit of hi
||[!A2] ADDAH   A5,A3,A3                ; hi = (hi << 1) | (lo >> 31)

        SPLOOP  3
;;====================================================================
  [!A1] CMPLTU  A3,A6,A1                ; hi < dv ?
||[ A1] ZERO    A1                      ; upper bit was set: force subtract
||      SHL     A4,1,A5:A4              ; lo << 1, shifted-out bit in A5
  [!A1] SUB     A3,A6,A3                ; hi -= dv
||[!A1] OR      1,A4,A4                 ; quotient bit
        SHRU    A3,31,A1                ; upper bit of hi
||      ADDAH   A5,A3,A3                ; hi = (hi << 1) | (lo >> 31)

        SPKERNEL
;;====================================================================
        BNOP    RA,5
        .endasmfunc

;;====================================================================
;; Not really Comba algorithm, just straightforward NxM... Dedicated
;; fully unrolled real Comba implementations are asymptotically 2x
;; faster, but naturally larger undertaking. Purpose of this exercise
;; was rather to learn to master nested SPLOOPs...
;;====================================================================
;;--------------------------------------------------------------------
;; _bn_mul_comba8: rp[0..15] = ap[0..7] * bp[0..7].
;; _bn_sqr_comba8: same code with bp set to ap.
;; In:   ARG0 = rp, ARG1 = ap, ARG2 = bp (mul entry only)
;;--------------------------------------------------------------------
        .global _bn_sqr_comba8
        .global _bn_mul_comba8
_bn_sqr_comba8:
        MV      ARG1,ARG2               ; square: multiply ap by itself
_bn_mul_comba8:
        .asmfunc
        MVK     8,B0                    ; N, RILC
||      MVK     8,A0                    ; M, outer loop counter
||      MV      ARG1,A5                 ; copy ap
||      MV      ARG0,B4                 ; copy rp
||      ZERO    B19                     ; high part of accumulator
        MVC     B0,RILC
||      SUB     B0,2,B1                 ; N-2, initial ILC
||      SUB     B0,1,B2                 ; const B2=N-1
||      LDW     *A5++,B6                ; ap[0]
||      MV      A0,A3                   ; const A3=M
sploopNxM?:                             ; for best performance arrange M<=N
   [A0] SPLOOPD 2                       ; 2*n+10
||      MVC     B1,ILC
||      ADDAW   B4,B0,B5
||      ZERO    B7
||      LDW     *A5++,A9                ; pre-fetch ap[1]
||      ZERO    A1
||      SUB     A0,1,A0
;;====================================================================
;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
;; This is because of Advisory 15 from TI publication SPRZ247I.
        LDW     *ARG2++,A7              ; bp[i]
        NOP     3
   [A1] LDW     *B5++,B7                ; rp[i]
        MPY32U  A7,B6,B17:B16
        NOP     3
        ADDU    B16,B7,B21:B20
        ADDU    B19,B21:B20,B19:B18
||      MV.S    B17,B23
        SPKERNEL
||      STW     B18,*B4++               ; rp[i]
||      ADD.S   B19,B23,B19
;;====================================================================
outer?:                                 ; m*2*(n+1)+10
        SUBAW   ARG2,A3,ARG2            ; rewind bp to bp[0]
        SPMASKR
||      CMPGT   A0,1,A2                 ; done pre-fetching ap[i+1]?
        MVD     A9,B6                   ; move through .M unit(*)
   [A2] LDW     *A5++,A9                ; pre-fetch ap[i+1]
        SUBAW   B5,B2,B5                ; rewind rp to rp[1]
        MVK     1,A1
   [A0] BNOP.S1 outer?,4
|| [A0] SUB.L   A0,1,A0
        STW     B19,*B4--[B2]           ; rewind rp to rp[1]
||      ZERO.S  B19                     ; high part of accumulator
;; end of outer?

        BNOP    RA,5                    ; return
        .endasmfunc
;; (*)  It should be noted that B6 is used as input to MPY32U in
;;      chronologically next cycle in *preceding* SPLOOP iteration.
;;      Normally such arrangement would require DINT, but at this
;;      point SPLOOP is draining and interrupts are disabled
;;      implicitly.

;;--------------------------------------------------------------------
;; _bn_mul_comba4: rp[0..7] = ap[0..3] * bp[0..3].
;; _bn_sqr_comba4: same code with bp set to ap.
;; In:   ARG0 = rp, ARG1 = ap, ARG2 = bp (mul entry only)
;;--------------------------------------------------------------------
        .global _bn_sqr_comba4
        .global _bn_mul_comba4
_bn_sqr_comba4:
        MV      ARG1,ARG2               ; square: multiply ap by itself
_bn_mul_comba4:
        .asmfunc
        .if     0
        BNOP    sploopNxM?,3
        ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
        ;; because of read-after-write penalties, it's rather
        ;; n*2*(n+3)+10, or 66 cycles [plus various overheads]...
        MVK     4,B0                    ; N, RILC
||      MVK     4,A0                    ; M, outer loop counter
||      MV      ARG1,A5                 ; copy ap
||      MV      ARG0,B4                 ; copy rp
||      ZERO    B19                     ; high part of accumulator
        MVC     B0,RILC
||      SUB     B0,2,B1                 ; first ILC
||      SUB     B0,1,B2                 ; const B2=N-1
||      LDW     *A5++,B6                ; ap[0]
||      MV      A0,A3                   ; const A3=M
        .else
        ;; This alternative is an exercise in fully unrolled Comba
        ;; algorithm implementation that operates at n*(n+1)+12, or
        ;; as little as 32 cycles...
        LDW     *ARG1[0],B16            ; a[0]
||      LDW     *ARG2[0],A16            ; b[0]
        LDW     *ARG1[1],B17            ; a[1]
||      LDW     *ARG2[1],A17            ; b[1]
        LDW     *ARG1[2],B18            ; a[2]
||      LDW     *ARG2[2],A18            ; b[2]
        LDW     *ARG1[3],B19            ; a[3]
||      LDW     *ARG2[3],A19            ; b[3]
        NOP
        MPY32U  A16,B16,A1:A0           ; a[0]*b[0]
        MPY32U  A17,B16,A23:A22         ; a[0]*b[1]
        MPY32U  A16,B17,A25:A24         ; a[1]*b[0]
        MPY32U  A16,B18,A27:A26         ; a[2]*b[0]
        STW     A0,*ARG0[0]
||      MPY32U  A17,B17,A29:A28         ; a[1]*b[1]
        MPY32U  A18,B16,A31:A30         ; a[0]*b[2]
||      ADDU    A22,A1,A1:A0
        MV      A23,B0
||      MPY32U  A19,B16,A21:A20         ; a[3]*b[0]
||      ADDU    A24,A1:A0,A1:A0
        ADDU    A25,B0,B1:B0
||      STW     A0,*ARG0[1]
||      MPY32U  A18,B17,A23:A22         ; a[2]*b[1]
||      ADDU    A26,A1,A9:A8
        ADDU    A27,B1,B9:B8
||      MPY32U  A17,B18,A25:A24         ; a[1]*b[2]
||      ADDU    A28,A9:A8,A9:A8
        ADDU    A29,B9:B8,B9:B8
||      MPY32U  A16,B19,A27:A26         ; a[0]*b[3]
||      ADDU    A30,A9:A8,A9:A8
        ADDU    A31,B9:B8,B9:B8
||      ADDU    B0,A9:A8,A9:A8
        STW     A8,*ARG0[2]
||      ADDU    A20,A9,A1:A0
        ADDU    A21,B9,B1:B0
||      MPY32U  A19,B17,A21:A20         ; a[3]*b[1]
||      ADDU    A22,A1:A0,A1:A0
        ADDU    A23,B1:B0,B1:B0
||      MPY32U  A18,B18,A23:A22         ; a[2]*b[2]
||      ADDU    A24,A1:A0,A1:A0
        ADDU    A25,B1:B0,B1:B0
||      MPY32U  A17,B19,A25:A24         ; a[1]*b[3]
||      ADDU    A26,A1:A0,A1:A0
        ADDU    A27,B1:B0,B1:B0
||      ADDU    B8,A1:A0,A1:A0
        STW     A0,*ARG0[3]
||      MPY32U  A19,B18,A27:A26         ; a[3]*b[2]
||      ADDU    A20,A1,A9:A8
        ADDU    A21,B1,B9:B8
||      MPY32U  A18,B19,A29:A28         ; a[2]*b[3]
||      ADDU    A22,A9:A8,A9:A8
        ADDU    A23,B9:B8,B9:B8
||      MPY32U  A19,B19,A31:A30         ; a[3]*b[3]
||      ADDU    A24,A9:A8,A9:A8
        ADDU    A25,B9:B8,B9:B8
||      ADDU    B0,A9:A8,A9:A8
        STW     A8,*ARG0[4]
||      ADDU    A26,A9,A1:A0
        ADDU    A27,B9,B1:B0
||      ADDU    A28,A1:A0,A1:A0
        ADDU    A29,B1:B0,B1:B0
||      BNOP    RA                      ; return overlaps the tail below
||      ADDU    B8,A1:A0,A1:A0
        STW     A0,*ARG0[5]
||      ADDU    A30,A1,A9:A8
        ADD     A31,B1,B8
        ADDU    B0,A9:A8,A9:A8          ; removed || to avoid cross-path stall below
        ADD     B8,A9,A9
||      STW     A8,*ARG0[6]
        STW     A9,*ARG0[7]
        .endif
        .endasmfunc