1 ;;====================================================================
2 ;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 ;; Rights for redistribution and usage in source and binary forms are
6 ;; granted according to the OpenSSL license. Warranty of any kind is
8 ;;====================================================================
9 ;; A compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
10 ;; being the number of 32-bit words; addition runs at 8*n. The
11 ;; corresponding 4x unrolled SPLOOP-free loops run at ~8*n and ~5*n.
12 ;; The assembler SPLOOPs below spin at ... 2*n cycles [plus epilogue].
13 ;;====================================================================
;; Exported entry for bn_mul_add_words.
;; Per iteration the loop reads ap[i] through ARG1 and rp[i] through ARG0,
;; folds the running high word A19 into the sum and writes the low word
;; A18 back to rp[i] through A2. When the loop is done, A19 is handed
;; back to the caller in RET.
28 .global _bn_mul_add_words
;; [B0] makes the instruction conditional on B0 being non-zero.
;; NOTE(review): B0 presumably holds the word count - confirm.
35 [B0] ZERO A19 ; high part of accumulator
41 ;;====================================================================
42 LDW *ARG1++,B7 ; ap[i]
44 LDW *ARG0++,A7 ; rp[i]
46 NOP 3 ; [2,0] in epilogue
;; Add the carried high word A19 to A21:A20; the result lands in A19:A18,
;; so A19 again holds the carry for the next iteration.
48 ADDU A19,A21:A20,A19:A18
50 SPKERNEL 2,1 ; leave slot for "return value"
51 || STW A18,*A2++ ; rp[i]
53 ;;====================================================================
;; The final high word of the accumulator is the return value.
55 MV A19,RET ; return value
;; NOTE(review): the entry label is not visible next to this loop;
;; presumably this is bn_mul_words - confirm.
;; Per iteration the loop reads ap[i] through ARG1, multiplies it by the
;; word held in ARG3 and stores the low accumulator word A18 to rp[i]
;; through ARG0. A19 is returned in RET.
;; [B0] makes the instruction conditional on B0 being non-zero.
65 [B0] ZERO A19 ; high part of accumulator
69 ;;====================================================================
70 LDW *ARG1++,A7 ; ap[i]
;; Unsigned 32x32 multiply; the 64-bit product is placed in A17:A16.
72 MPY32U A7,ARG3,A17:A16
73 NOP 4 ; [2,0] in epilogue
76 SPKERNEL 2,1 ; leave slot for "return value"
77 || STW A18,*ARG0++ ; rp[i]
79 ;;====================================================================
;; The final high word of the accumulator is the return value.
81 MV A19,RET ; return value
92 || [B0] ADD 4,ARG0,ARG0
;; NOTE(review): the entry label is not visible next to this loop;
;; presumably this is bn_sqr_words - confirm.
;; The ADD above advances ARG0 by 4 bytes (one 32-bit word) when B0 is
;; non-zero, so that ARG0 addresses the odd result words rp[2*i+1] while
;; B2 addresses the even result words rp[2*i]. Both pointers then step by
;; 8 bytes per iteration.
96 ;;====================================================================
97 LDW *ARG1++,B7 ; ap[i]
100 NOP 3 ; [2,0] in epilogue
101 STW B0,*B2++(8) ; rp[2*i]
103 SPKERNEL 2,0 ; fully overlap BNOP RA,5
104 || STW A1,*ARG0++(8) ; rp[2*i+1]
105 ;;====================================================================
;; Exported entry for bn_add_words.
;; Per iteration the loop reads bp[i] through ARG2 and ap[i] through ARG1
;; in parallel and stores the result word A0 through A3. The carry flag
;; lives in A1 and is copied into RET on every iteration, so RET already
;; holds the final carry when the loop ends.
109 .global _bn_add_words
;; [B0] makes the instruction conditional on B0 being non-zero.
116 [B0] ZERO A1 ; carry flag
121 ;;====================================================================
122 LDW *ARG2++,A7 ; bp[i]
123 || LDW *ARG1++,B7 ; ap[i]
127 SPKERNEL 0,0 ; fully overlap BNOP RA,5
128 || STW A0,*A3++ ; write result
129 || MV A1,RET ; keep carry flag in RET
130 ;;====================================================================
;; Exported entry for bn_sub_words.
;; Per iteration the loop reads bp[i] through ARG2 and ap[i] through ARG1
;; in parallel and stores the result word A0 through A3. The borrow flag
;; is kept in A2 between iterations and is derived from bit 0 of A1, the
;; upper half of the A1:A0 pair.
134 .global _bn_sub_words
;; [B0] makes the instruction conditional on B0 being non-zero.
141 [B0] ZERO A2 ; borrow flag
146 ;;====================================================================
147 LDW *ARG2++,A7 ; bp[i]
148 || LDW *ARG1++,B7 ; ap[i]
;; Only when the incoming borrow flag A2 is set: take one more off the
;; A1:A0 pair.
151 [A2] SUB A1:A0,1,A1:A0
152 SPKERNEL 0,1 ; leave slot for "return borrow flag"
153 || STW A0,*A3++ ; write result
154 || AND 1,A1,A2 ; pass on borrow flag
155 ;;====================================================================
;; Same bit-0 extraction as inside the loop, this time into RET.
157 AND 1,A1,RET ; return borrow flag
;; Exported entry for bn_div_words.
;; The division itself is not implemented here; it is delegated to the
;; __divull helper, with A3 named as the return-address register.
160 .global _bn_div_words
164 CALLP __divull,A3 ; jump to rts64plus.lib
171 ;;====================================================================
172 ;; Not really the Comba algorithm, just a straightforward NxM...
173 ;; Dedicated, fully unrolled real Comba implementations are
174 ;; asymptotically 2x faster, but naturally a larger undertaking. The
175 ;; purpose of this exercise was rather to learn to master nested SPLOOPs...
176 ;;====================================================================
;; bn_sqr_comba8 and bn_mul_comba8 are exported back to back.
;; NOTE(review): presumably both symbols label the same code - confirm.
;;
;; Register roles, as set up below:
;;   A0  outer loop counter, starts at M = 8 and counts down
;;   A3  constant copy of M, used to rewind bp
;;   B1  N-2, initial inner loop count (ILC)
;;   B2  constant N-1, used to rewind rp
;;   A5  walks ap; B6 holds the current ap word, A9 the pre-fetched next
;;   B4  store pointer into rp; B5 load pointer into rp
;;   B19 high part of the accumulator, cleared for every outer pass
177 .global _bn_sqr_comba8
178 .global _bn_mul_comba8
184 || MVK 8,A0 ; M, outer loop counter
185 || MV ARG1,A5 ; copy ap
186 || MV ARG0,B4 ; copy rp
187 || ZERO B19 ; high part of accumulator
189 || SUB B0,2,B1 ; N-2, initial ILC
190 || SUB B0,1,B2 ; const B2=N-1
191 || LDW *A5++,B6 ; ap[0]
192 || MV A0,A3 ; const A3=M
193 sploopNxM?: ; for best performance arrange M<=N
194 [A0] SPLOOPD 2 ; 2*n+10
198 || LDW *A5++,A9 ; pre-fetch ap[1]
201 ;;====================================================================
202 ;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
203 ;; This is because of Advisory 15 from TI publication SPRZ247I.
204 LDW *ARG2++,A7 ; bp[i]
;; The rp[i] load is conditional on A1 being non-zero.
206 [A1] LDW *B5++,B7 ; rp[i]
;; Add the carried high word B19 to B21:B20; the result lands in B19:B18.
210 ADDU B19,B21:B20,B19:B18
213 || STW B18,*B4++ ; rp[i]
215 ;;====================================================================
;; Outer loop tail: rewind the bp and rp pointers, promote the
;; pre-fetched ap word from A9 to B6, and branch back while A0 is
;; non-zero.
216 outer?: ; m*2*(n+1)+10
217 SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0]
219 || CMPGT A0,1,A2 ; done pre-fetching ap[i+1]?
220 MVD A9,B6 ; move through .M unit(*)
221 [A2] LDW *A5++,A9 ; pre-fetch ap[i+1]
222 SUBAW B5,B2,B5 ; rewind rp to rp[1]
224 [A0] BNOP.S1 outer?,4
225 || [A0] SUB.L A0,1,A0
;; Store the high accumulator word, then step B4 back by B2 words.
226 STW B19,*B4--[B2] ; rewind rp to rp[1]
227 || ZERO.S B19 ; high part of accumulator
231 ;; (*) It should be noted that B6 is used as input to MPY32U in
232 ;; chronologically next cycle in *preceding* SPLOOP iteration.
233 ;; Normally such arrangement would require DINT, but at this
234 ;; point SPLOOP is draining and interrupts are disabled
;; bn_sqr_comba4 and bn_mul_comba4 are exported back to back.
;; NOTE(review): presumably both symbols label the same code - confirm.
;; Two alternatives appear below: the same NxM nested-loop setup as the
;; 8-word variant with M = 4, and a fully unrolled 4x4 multiplication.
237 .global _bn_sqr_comba4
238 .global _bn_mul_comba4
245 ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
246 ;; because of read-after-write penalties, it's rather
247 ;; n*2*(n+3)+10, or 66 cycles [plus various overheads]...
249 || MVK 4,A0 ; M, outer loop counter
250 || MV ARG1,A5 ; copy ap
251 || MV ARG0,B4 ; copy rp
252 || ZERO B19 ; high part of accumulator
254 || SUB B0,2,B1 ; first ILC
255 || SUB B0,1,B2 ; const B2=N-1
256 || LDW *A5++,B6 ; ap[0]
257 || MV A0,A3 ; const A3=M
259 ;; This alternative is an exercise in a fully unrolled Comba
260 ;; algorithm implementation that operates at n*(n+1)+12, or
261 ;; as little as 32 cycles...
262 LDW *ARG1[0],B16 ; a[0]
263 || LDW *ARG2[0],A16 ; b[0]
264 LDW *ARG1[1],B17 ; a[1]
265 || LDW *ARG2[1],A17 ; b[1]
266 LDW *ARG1[2],B18 ; a[2]
267 || LDW *ARG2[2],A18 ; b[2]
268 LDW *ARG1[3],B19 ; a[3]
269 || LDW *ARG2[3],A19 ; b[3]
;; Register map for the products below: B16..B19 hold a[0..3] and
;; A16..A19 hold b[0..3], as loaded above. Each MPY32U forms an unsigned
;; 64-bit product in an A-side register pair, and the ADDU instructions
;; fold partial sums into the A1:A0 and A9:A8 accumulators.
;; The per-product comments name the operands actually multiplied; the
;; two products a[i]*b[j] and a[j]*b[i] always belong to the same result
;; column, so their relative order does not affect the result.
271 MPY32U A16,B16,A1:A0 ; a[0]*b[0]
272 MPY32U A17,B16,A23:A22 ; a[0]*b[1]
273 MPY32U A16,B17,A25:A24 ; a[1]*b[0]
274 MPY32U A16,B18,A27:A26 ; a[2]*b[0]
276 || MPY32U A17,B17,A29:A28 ; a[1]*b[1]
277 MPY32U A18,B16,A31:A30 ; a[0]*b[2]
280 || MPY32U A19,B16,A21:A20 ; a[0]*b[3]
281 || ADDU A24,A1:A0,A1:A0
284 || MPY32U A18,B17,A23:A22 ; a[1]*b[2]
287 || MPY32U A17,B18,A25:A24 ; a[2]*b[1]
288 || ADDU A28,A9:A8,A9:A8
290 || MPY32U A16,B19,A27:A26 ; a[3]*b[0]
291 || ADDU A30,A9:A8,A9:A8
293 || ADDU B0,A9:A8,A9:A8
297 || MPY32U A19,B17,A21:A20 ; a[1]*b[3]
298 || ADDU A22,A1:A0,A1:A0
300 || MPY32U A18,B18,A23:A22 ; a[2]*b[2]
301 || ADDU A24,A1:A0,A1:A0
303 || MPY32U A17,B19,A25:A24 ; a[3]*b[1]
304 || ADDU A26,A1:A0,A1:A0
306 || ADDU B8,A1:A0,A1:A0
308 || MPY32U A19,B18,A27:A26 ; a[2]*b[3]
311 || MPY32U A18,B19,A29:A28 ; a[3]*b[2]
312 || ADDU A22,A9:A8,A9:A8
314 || MPY32U A19,B19,A31:A30 ; a[3]*b[3]
315 || ADDU A24,A9:A8,A9:A8
317 || ADDU B0,A9:A8,A9:A8
321 || ADDU A28,A1:A0,A1:A0
324 || ADDU B8,A1:A0,A1:A0
328 ADDU B0,A9:A8,A9:A8 ; removed || to avoid cross-path stall below