crypto/bn/asm/bn-c64xplus.asm

   1 ;;====================================================================
   2 ;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   3 ;; project.
   4 ;;
   5 ;; Rights for redistribution and usage in source and binary forms are
   6 ;; granted according to the OpenSSL license. Warranty of any kind is
   7 ;; disclaimed.
   8 ;;====================================================================
   9 ;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
  10 ;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
  11 ;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
  12 ;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
  13 ;;====================================================================
  14         .text
  15
     ;; Assemblers older than version 7000000 get __TI_EABI__ defined as 0
     ;; here, so the .if below always has a value to test (presumably such
     ;; assemblers do not predefine the symbol -- confirm).
  16         .if     .ASSEMBLER_VERSION<7000000
  17         .asg    0,__TI_EABI__
  18         .endif
     ;; When __TI_EABI__ is non-zero, each underscore-prefixed entry-point
     ;; name used in this file is substituted with the same name without
     ;; the leading underscore.
  19         .if     __TI_EABI__
  20         .asg    bn_mul_add_words,_bn_mul_add_words
  21         .asg    bn_mul_words,_bn_mul_words
  22         .asg    bn_sqr_words,_bn_sqr_words
  23         .asg    bn_add_words,_bn_add_words
  24         .asg    bn_sub_words,_bn_sub_words
  25         .asg    bn_div_words,_bn_div_words
  26         .asg    bn_sqr_comba8,_bn_sqr_comba8
  27         .asg    bn_mul_comba8,_bn_mul_comba8
  28         .asg    bn_sqr_comba4,_bn_sqr_comba4
  29         .asg    bn_mul_comba4,_bn_mul_comba4
  30         .endif
  31
  32         .asg    B3,RA           ; return address, target of every BNOP RA
  33         .asg    A4,ARG0         ; 1st argument
  34         .asg    B4,ARG1         ; 2nd argument
  35         .asg    A6,ARG2         ; 3rd argument
  36         .asg    B6,ARG3         ; 4th argument
  37         .asg    A8,ARG4         ; not referenced in the code visible here
  38         .asg    B8,ARG5         ; not referenced in the code visible here
  39         .asg    A4,RET          ; return value, same register as ARG0
  40         .asg    A15,FP          ; not referenced in the code visible here
  41         .asg    B14,DP          ; not referenced in the code visible here
  42         .asg    B15,SP          ; not referenced in the code visible here
  43
  44         .global _bn_mul_add_words
     ;;--------------------------------------------------------------------
     ;; bn_mul_add_words: for each of the num words,
     ;;   rp[i] = low 32 bits of (rp[i] + ap[i]*w + carry)
     ;; with the running carry word kept in A19.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = num, ARG3 = w
     ;; Out:  RET = final carry word, or 0 when num is 0
     ;; Presumably matches the C prototype
     ;;   BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap,
     ;;                             int num, BN_ULONG w)
     ;; -- confirm against the declaration used by the callers.
     ;;--------------------------------------------------------------------
  45 _bn_mul_add_words:
  46         .asmfunc
  47         MV      ARG2,B0         ; B0 = num, used as predicate below
  48   [!B0] BNOP    RA              ; num==0: return right away...
  49 ||[!B0] MVK     0,RET           ; ...with 0 as the result
  50    [B0] MVC     B0,ILC          ; SPLOOP iteration count = num
  51    [B0] ZERO    A19             ; high part of accumulator
  52 || [B0] MV      ARG0,A2         ; A2 = separate rp pointer used by the stores
  53 || [B0] MV      ARG3,A3         ; A3 = w
  54         NOP     3
  55
  56         SPLOOP  2               ; 2*n+10
  57 ;;====================================================================
  58         LDW     *ARG1++,B7      ; ap[i]
  59         NOP     3
  60         LDW     *ARG0++,A7      ; rp[i]
  61         MPY32U  B7,A3,A17:A16   ; A17:A16 = ap[i]*w
  62         NOP     3               ; [2,0] in epilogue
  63         ADDU    A16,A7,A21:A20  ; low product word + rp[i]
  64         ADDU    A19,A21:A20,A19:A18 ; + carry word from previous i
  65 ||      MV.S    A17,A23         ; copy of high product word
  66         SPKERNEL 2,1            ; leave slot for "return value"
  67 ||      STW     A18,*A2++       ; rp[i]
  68 ||      ADD     A19,A23,A19     ; next carry = sum overflow + high product word
  69 ;;====================================================================
  70         BNOP    RA,4
  71         MV      A19,RET         ; return value
  72         .endasmfunc
  73
  74         .global _bn_mul_words
     ;;--------------------------------------------------------------------
     ;; bn_mul_words: for each of the num words,
     ;;   rp[i] = low 32 bits of (ap[i]*w + carry)
     ;; with the running carry word kept in A19. Unlike bn_mul_add_words
     ;; the previous contents of rp[] are not read.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = num, ARG3 = w
     ;; Out:  RET = final carry word, or 0 when num is 0
     ;; Presumably matches the C prototype
     ;;   BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap,
     ;;                         int num, BN_ULONG w)
     ;; -- confirm against the declaration used by the callers.
     ;;--------------------------------------------------------------------
  75 _bn_mul_words:
  76         .asmfunc
  77         MV      ARG2,B0         ; B0 = num, used as predicate below
  78   [!B0] BNOP    RA              ; num==0: return right away...
  79 ||[!B0] MVK     0,RET           ; ...with 0 as the result
  80    [B0] MVC     B0,ILC          ; SPLOOP iteration count = num
  81    [B0] ZERO    A19             ; high part of accumulator
  82         NOP     3
  83
  84         SPLOOP  2               ; 2*n+10
  85 ;;====================================================================
  86         LDW     *ARG1++,A7      ; ap[i]
  87         NOP     4
  88         MPY32U  A7,ARG3,A17:A16 ; A17:A16 = ap[i]*w
  89         NOP     4               ; [2,0] in epilogue
  90         ADDU    A19,A16,A19:A18 ; low product word + carry word from previous i
  91 ||      MV.S    A17,A21         ; copy of high product word
  92         SPKERNEL 2,1            ; leave slot for "return value"
  93 ||      STW     A18,*ARG0++     ; rp[i]
  94 ||      ADD.L   A19,A21,A19     ; next carry = sum overflow + high product word
  95 ;;====================================================================
  96         BNOP    RA,4
  97         MV      A19,RET         ; return value
  98         .endasmfunc
  99
 100         .global _bn_sqr_words
     ;;--------------------------------------------------------------------
     ;; bn_sqr_words: for each of the num input words, store the 64-bit
     ;; square of ap[i] as two words: rp[2*i] = low word,
     ;; rp[2*i+1] = high word. Nothing is carried between words.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = num
     ;; Out:  no result is set on the main path (the ARG0/RET register
     ;;       serves as the odd-word store pointer); the num==0 early
     ;;       exit writes 0 to RET.
     ;; Presumably matches the C prototype
     ;;   void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num)
     ;; -- confirm against the declaration used by the callers.
     ;;--------------------------------------------------------------------
 101 _bn_sqr_words:
 102         .asmfunc
 103         MV      ARG2,B0         ; B0 = num, used as predicate below
 104   [!B0] BNOP    RA              ; num==0: return right away
 105 ||[!B0] MVK     0,RET
 106    [B0] MVC     B0,ILC          ; SPLOOP iteration count = num
 107    [B0] MV      ARG0,B2         ; B2 -> rp[0], even-word store pointer
 108 || [B0] ADD     4,ARG0,ARG0     ; ARG0 -> rp[1], odd-word store pointer
 109         NOP     3
 110
 111         SPLOOP  2               ; 2*n+10
 112 ;;====================================================================
 113         LDW     *ARG1++,B7      ; ap[i]
 114         NOP     4
 115         MPY32U  B7,B7,B1:B0     ; B1:B0 = ap[i]*ap[i] (B0 is free, count is in ILC)
 116         NOP     3               ; [2,0] in epilogue
 117         STW     B0,*B2++(8)     ; rp[2*i]
 118         MV      B1,A1           ; high word over to the A side for its store
 119         SPKERNEL 2,0            ; fully overlap BNOP RA,5
 120 ||      STW     A1,*ARG0++(8)   ; rp[2*i+1]
 121 ;;====================================================================
 122         BNOP    RA,5
 123         .endasmfunc
 124
 125         .global _bn_add_words
     ;;--------------------------------------------------------------------
     ;; bn_add_words: for each of the num words,
     ;;   rp[i] = low 32 bits of (ap[i] + bp[i] + carry)
     ;; with the carry kept in A1 between words.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = num
     ;; Out:  RET = carry out of the last word, or 0 when num is 0
     ;; Presumably matches the C prototype
     ;;   BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap,
     ;;                         const BN_ULONG *bp, int num)
     ;; -- confirm against the declaration used by the callers.
     ;;--------------------------------------------------------------------
 126 _bn_add_words:
 127         .asmfunc
 128         MV      ARG3,B0         ; B0 = num, used as predicate below
 129   [!B0] BNOP    RA              ; num==0: return right away...
 130 ||[!B0] MVK     0,RET           ; ...with 0 as the result
 131    [B0] MVC     B0,ILC          ; SPLOOP iteration count = num
 132    [B0] ZERO    A1              ; carry flag
 133 || [B0] MV      ARG0,A3         ; A3 = rp store pointer, frees ARG0/RET for the carry
 134         NOP     3
 135
 136         SPLOOP  2               ; 2*n+6
 137 ;;====================================================================
 138         LDW     *ARG2++,A7      ; bp[i]
 139 ||      LDW     *ARG1++,B7      ; ap[i]
 140         NOP     4
 141         ADDU    A7,B7,A9:A8     ; A9:A8 = ap[i]+bp[i]
 142         ADDU    A1,A9:A8,A1:A0  ; + incoming carry; A1 = outgoing carry
 143         SPKERNEL 0,0            ; fully overlap BNOP RA,5
 144 ||      STW     A0,*A3++        ; write result
 145 ||      MV      A1,RET          ; keep carry flag in RET
 146 ;;====================================================================
 147         BNOP    RA,5
 148         .endasmfunc
 149
 150         .global _bn_sub_words
     ;;--------------------------------------------------------------------
     ;; bn_sub_words: for each of the num words,
     ;;   rp[i] = low 32 bits of (ap[i] - bp[i] - borrow)
     ;; with the borrow (0 or 1) kept in A2 between words.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = num
     ;; Out:  RET = borrow out of the last word (0 or 1), or 0 when num
     ;;       is 0
     ;; Presumably matches the C prototype
     ;;   BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap,
     ;;                         const BN_ULONG *bp, int num)
     ;; -- confirm against the declaration used by the callers.
     ;;--------------------------------------------------------------------
 151 _bn_sub_words:
 152         .asmfunc
 153         MV      ARG3,B0         ; B0 = num, used as predicate below
 154   [!B0] BNOP    RA              ; num==0: return right away...
 155 ||[!B0] MVK     0,RET           ; ...with 0 as the result
 156    [B0] MVC     B0,ILC          ; SPLOOP iteration count = num
 157    [B0] ZERO    A2              ; borrow flag
 158 || [B0] MV      ARG0,A3         ; A3 = rp store pointer
 159         NOP     3
 160
 161         SPLOOP  2               ; 2*n+6
 162 ;;====================================================================
 163         LDW     *ARG2++,A7      ; bp[i]
 164 ||      LDW     *ARG1++,B7      ; ap[i]
 165         NOP     4
 166         SUBU    B7,A7,A1:A0     ; A1:A0 = ap[i]-bp[i] as a 40-bit value
 167   [A2]  SUB     A1:A0,1,A1:A0   ; take off the incoming borrow
 168         SPKERNEL 0,1            ; leave slot for "return borrow flag"
 169 ||      STW     A0,*A3++        ; write result
 170 ||      AND     1,A1,A2         ; pass on borrow flag
 171 ;;====================================================================
 172         BNOP    RA,4
 173         AND     1,A1,RET        ; return borrow flag
 174         .endasmfunc
 175
 176         .global _bn_div_words
     ;;--------------------------------------------------------------------
     ;; bn_div_words: shift-and-subtract division producing one quotient
     ;; bit per loop pass.
     ;; In:   A4 = hi, B4 = lo, A6 = dv (names taken from the comments
     ;;       below)
     ;; Out:  A4 = quotient, or -1 when hi has fewer leading zero bits
     ;;       than dv ("overflow")
     ;; Presumably this is the quotient of the 64-bit value hi:lo by dv,
     ;; matching
     ;;   BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
     ;; -- confirm against the C reference implementation.
     ;;--------------------------------------------------------------------
 177 _bn_div_words:
 178         .asmfunc
 179         LMBD    1,A6,A0         ; leading zero bits in dv
 180         LMBD    1,A4,A1         ; leading zero bits in hi
 181 ||      MVK     32,B0
 182         CMPLTU  A1,A0,A2        ; A2 = overflow: hi has fewer leading zeros than dv
 183 ||      ADD     A0,B0,B0        ; B0 = 32 + leading zeros in dv, the loop count
 184   [ A2] BNOP    RA
 185 ||[ A2] MVK     -1,A4           ; return overflow
 186 ||[!A2] MV      A4,A3           ; reassign hi
 187   [!A2] MV      B4,A4           ; reassign lo, will be quotient
 188 ||[!A2] MVC     B0,ILC
 189   [!A2] SHL     A6,A0,A6        ; normalize dv
 190 ||      MVK     1,A1            ; A1=1 keeps the [!A1] pair below idle on the overflow path
 191
     ;; First step done ahead of the loop; same three packets as the
     ;; SPLOOP body except for the predicates.
 192   [!A2] CMPLTU  A3,A6,A1        ; hi<dv?
 193 ||[!A2] SHL     A4,1,A5:A4      ; lo<<1
 194   [!A1] SUB     A3,A6,A3        ; hi-=dv
 195 ||[!A1] OR      1,A4,A4         ; set quotient bit
 196   [!A2] SHRU    A3,31,A1        ; upper bit
 197 ||[!A2] ADDAH   A5,A3,A3        ; hi<<1|lo>>31
 198
 199         SPLOOP  3
     ;; A1 on entry is the bit shifted out of hi by the previous step.
     ;; If it is set, the subtraction is done without comparing.
 200   [!A1] CMPLTU  A3,A6,A1        ; hi<dv?
 201 ||[ A1] ZERO    A1              ; bit was shifted out: force the subtract below
 202 ||      SHL     A4,1,A5:A4      ; lo<<1
 203   [!A1] SUB     A3,A6,A3        ; hi-=dv
 204 ||[!A1] OR      1,A4,A4         ; quotient
 205         SHRU    A3,31,A1        ; upper bit
 206 ||      ADDAH   A5,A3,A3        ; hi<<1|lo>>31
 207         SPKERNEL
 208
 209         BNOP    RA,5            ; quotient is already in A4
 210         .endasmfunc
 211
 212 ;;====================================================================
 213 ;; Not really Comba algorithm, just straightforward NxM... Dedicated
 214 ;; fully unrolled real Comba implementations are asymptotically 2x
 215 ;; faster, but naturally larger undertaking. Purpose of this exercise
 216 ;; was rather to learn to master nested SPLOOPs...
 217 ;;====================================================================
 218         .global _bn_sqr_comba8
 219         .global _bn_mul_comba8
     ;;--------------------------------------------------------------------
     ;; bn_mul_comba8: N x M word multiplication with N = M = 8, built as
     ;; an outer loop over the words of ap around the bn_mul_add_words
     ;; inner SPLOOP over the words of bp.
     ;; bn_sqr_comba8: copies ARG1 into ARG2 and falls through, so the
     ;; same code multiplies ap by itself.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = bp
     ;; Out:  result words stored through rp; RET is not set
     ;; Registers: A0 = outer counter, A3 = M, B0 = N, B2 = N-1,
     ;;            B4 = rp store pointer, B5 = rp load pointer,
     ;;            A5 = ap pointer, B6 = current ap word, A9 = next ap
     ;;            word, B19 = carry word, A1 = "load rp[i]" predicate
     ;;--------------------------------------------------------------------
 220 _bn_sqr_comba8:
 221         MV      ARG1,ARG2       ; square: second operand is ap as well
 222 _bn_mul_comba8:
 223         .asmfunc
 224         MVK     8,B0            ; N, RILC
 225 ||      MVK     8,A0            ; M, outer loop counter
 226 ||      MV      ARG1,A5         ; copy ap
 227 ||      MV      ARG0,B4         ; copy rp
 228 ||      ZERO    B19             ; high part of accumulator
 229         MVC     B0,RILC
 230 ||      SUB     B0,2,B1         ; N-2, initial ILC
 231 ||      SUB     B0,1,B2         ; const B2=N-1
 232 ||      LDW     *A5++,B6        ; ap[0]
 233 ||      MV      A0,A3           ; const A3=M
 234 sploopNxM?:                     ; for best performance arrange M<=N
 235    [A0] SPLOOPD 2               ; 2*n+10
 236 ||      MVC     B1,ILC
 237 ||      ADDAW   B4,B0,B5        ; B5 = rp+N words; first rewind below leaves it at rp[1]
 238 ||      ZERO    B7              ; rp[i] counts as 0 while A1 is 0
 239 ||      LDW     *A5++,A9        ; pre-fetch ap[1]
 240 ||      ZERO    A1              ; first pass: do not load rp[i]
 241 ||      SUB     A0,1,A0
 242 ;;====================================================================
 243 ;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
 244 ;; This is because of Advisory 15 from TI publication SPRZ247I.
 245         LDW     *ARG2++,A7      ; bp[i]
 246         NOP     3
 247    [A1] LDW     *B5++,B7        ; rp[i]
 248         MPY32U  A7,B6,B17:B16   ; B17:B16 = bp[i] * current ap word
 249         NOP     3
 250         ADDU    B16,B7,B21:B20  ; low product word + rp[i]
 251         ADDU    B19,B21:B20,B19:B18 ; + carry word from previous i
 252 ||      MV.S    B17,B23         ; copy of high product word
 253         SPKERNEL
 254 ||      STW     B18,*B4++       ; rp[i]
 255 ||      ADD.S   B19,B23,B19     ; next carry = sum overflow + high product word
 256 ;;====================================================================
 257 outer?:                         ; m*2*(n+1)+10
 258         SUBAW   ARG2,A3,ARG2    ; rewind bp to bp[0]
     ;; NOTE(review): SPMASKR presumably restarts the buffered loop using
     ;; the RILC count set above -- confirm against the TI SPLOOP
     ;; documentation.
 259         SPMASKR
 260 ||      CMPGT   A0,1,A2         ; done pre-fetching ap[i+1]?
 261         MVD     A9,B6           ; move through .M unit(*)
 262    [A2] LDW     *A5++,A9        ; pre-fetch ap[i+1]
 263         SUBAW   B5,B2,B5        ; rewind rp to rp[1]
 264         MVK     1,A1            ; later passes do load rp[i]
 265    [A0] BNOP.S1 outer?,4
 266 || [A0] SUB.L   A0,1,A0
 267         STW     B19,*B4--[B2]   ; store carry as top word, then step rp back N-1 words
 268 ||      ZERO.S  B19             ; high part of accumulator
 269 ;; end of outer?
 270         BNOP    RA,5            ; return
 271         .endasmfunc
 272 ;; (*)  It should be noted that B6 is used as input to MPY32U in
 273 ;;      chronologically next cycle in *preceding* SPLOOP iteration.
 274 ;;      Normally such arrangement would require DINT, but at this
 275 ;;      point SPLOOP is draining and interrupts are disabled
 276 ;;      implicitly.
 277
 278         .global _bn_sqr_comba4
 279         .global _bn_mul_comba4
     ;;--------------------------------------------------------------------
     ;; bn_mul_comba4: fully unrolled 4x4 word multiplication; the eight
     ;; result words are stored to r[0]..r[7].
     ;; bn_sqr_comba4: copies ARG1 into ARG2 and falls through, so the
     ;; same code multiplies a by itself.
     ;; In:   ARG0 = r, ARG1 = a (loaded into B16..B19),
     ;;       ARG2 = b (loaded into A16..A19)
     ;; Out:  result words stored through r; RET is not set
     ;; Scheme of the active (.else) branch: for every result column the
     ;; A-side register pair sums the low words of that column's products
     ;; and the B-side pair sums their high words. Each B-side sum is
     ;; added into the A-side sum of the next column, and each pair's
     ;; upper register seeds the following column's pair on the same
     ;; side.
     ;;--------------------------------------------------------------------
 280 _bn_sqr_comba4:
 281         MV      ARG1,ARG2       ; square: second operand is a as well
 282 _bn_mul_comba4:
 283         .asmfunc
 284         .if     0
 285         BNOP    sploopNxM?,3
 286         ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
 287         ;; because of low-counter effect, when prologue phase finishes
 288         ;; before SPKERNEL instruction is reached. As result it's 25%
 289         ;; slower than expected...
 290         MVK     4,B0            ; N, RILC
 291 ||      MVK     4,A0            ; M, outer loop counter
 292 ||      MV      ARG1,A5         ; copy ap
 293 ||      MV      ARG0,B4         ; copy rp
 294 ||      ZERO    B19             ; high part of accumulator
 295         MVC     B0,RILC
 296 ||      SUB     B0,2,B1         ; first ILC
 297 ||      SUB     B0,1,B2         ; const B2=N-1
 298 ||      LDW     *A5++,B6        ; ap[0]
 299 ||      MV      A0,A3           ; const A3=M
 300         .else
 301         ;; This alternative is an exercise in fully unrolled Comba
 302         ;; algorithm implementation that operates at n*(n+1)+12, or
 303         ;; as little as 32 cycles...
 304         LDW     *ARG1[0],B16    ; a[0]
 305 ||      LDW     *ARG2[0],A16    ; b[0]
 306         LDW     *ARG1[1],B17    ; a[1]
 307 ||      LDW     *ARG2[1],A17    ; b[1]
 308         LDW     *ARG1[2],B18    ; a[2]
 309 ||      LDW     *ARG2[2],A18    ; b[2]
 310         LDW     *ARG1[3],B19    ; a[3]
 311 ||      LDW     *ARG2[3],A19    ; b[3]
 312         NOP
 313         MPY32U  A16,B16,A1:A0   ; a[0]*b[0]
 314         MPY32U  A17,B16,A23:A22 ; a[0]*b[1]
 315         MPY32U  A16,B17,A25:A24 ; a[1]*b[0]
 316         MPY32U  A16,B18,A27:A26 ; a[2]*b[0]
 317         STW     A0,*ARG0[0]     ; r[0]
 318 ||      MPY32U  A17,B17,A29:A28 ; a[1]*b[1]
 319         MPY32U  A18,B16,A31:A30 ; a[0]*b[2]
 320 ||      ADDU    A22,A1,A1:A0
 321         MV      A23,B0
 322 ||      MPY32U  A19,B16,A21:A20 ; a[0]*b[3]
 323 ||      ADDU    A24,A1:A0,A1:A0
 324         ADDU    A25,B0,B1:B0
 325 ||      STW     A0,*ARG0[1]     ; r[1]
 326 ||      MPY32U  A18,B17,A23:A22 ; a[1]*b[2]
 327 ||      ADDU    A26,A1,A9:A8
 328         ADDU    A27,B1,B9:B8
 329 ||      MPY32U  A17,B18,A25:A24 ; a[2]*b[1]
 330 ||      ADDU    A28,A9:A8,A9:A8
 331         ADDU    A29,B9:B8,B9:B8
 332 ||      MPY32U  A16,B19,A27:A26 ; a[3]*b[0]
 333 ||      ADDU    A30,A9:A8,A9:A8
 334         ADDU    A31,B9:B8,B9:B8
 335 ||      ADDU    B0,A9:A8,A9:A8
 336         STW     A8,*ARG0[2]     ; r[2]
 337 ||      ADDU    A20,A9,A1:A0
 338         ADDU    A21,B9,B1:B0
 339 ||      MPY32U  A19,B17,A21:A20 ; a[1]*b[3]
 340 ||      ADDU    A22,A1:A0,A1:A0
 341         ADDU    A23,B1:B0,B1:B0
 342 ||      MPY32U  A18,B18,A23:A22 ; a[2]*b[2]
 343 ||      ADDU    A24,A1:A0,A1:A0
 344         ADDU    A25,B1:B0,B1:B0
 345 ||      MPY32U  A17,B19,A25:A24 ; a[3]*b[1]
 346 ||      ADDU    A26,A1:A0,A1:A0
 347         ADDU    A27,B1:B0,B1:B0
 348 ||      ADDU    B8,A1:A0,A1:A0
 349         STW     A0,*ARG0[3]     ; r[3]
 350 ||      MPY32U  A19,B18,A27:A26 ; a[2]*b[3]
 351 ||      ADDU    A20,A1,A9:A8
 352         ADDU    A21,B1,B9:B8
 353 ||      MPY32U  A18,B19,A29:A28 ; a[3]*b[2]
 354 ||      ADDU    A22,A9:A8,A9:A8
 355         ADDU    A23,B9:B8,B9:B8
 356 ||      MPY32U  A19,B19,A31:A30 ; a[3]*b[3]
 357 ||      ADDU    A24,A9:A8,A9:A8
 358         ADDU    A25,B9:B8,B9:B8
 359 ||      ADDU    B0,A9:A8,A9:A8
 360         STW     A8,*ARG0[4]     ; r[4]
 361 ||      ADDU    A26,A9,A1:A0
 362         ADDU    A27,B9,B1:B0
 363 ||      ADDU    A28,A1:A0,A1:A0
     ;; The return branch is issued here; the five execute packets that
     ;; follow it (its delay slots) finish the sums and end with the
     ;; store to r[7].
 364         ADDU    A29,B1:B0,B1:B0
 365 ||      BNOP    RA
 366 ||      ADDU    B8,A1:A0,A1:A0
 367         STW     A0,*ARG0[5]     ; r[5]
 368 ||      ADDU    A30,A1,A9:A8
 369         ADD     A31,B1,B8       ; top word, part 1: last high product word + B-side carry
 370         ADDU    B0,A9:A8,A9:A8  ; removed || to avoid cross-path stall below
 371         ADD     B8,A9,A9        ; top word, part 2: + A-side carry
 372 ||      STW     A8,*ARG0[6]     ; r[6]
 373         STW     A9,*ARG0[7]     ; r[7]
 374         .endif
 375         .endasmfunc