crypto/bn/asm/bn-c64xplus.asm

   1 ;;====================================================================
   2 ;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   3 ;; project.
   4 ;;
   5 ;; Rights for redistribution and usage in source and binary forms are
   6 ;; granted according to the OpenSSL license. Warranty of any kind is
   7 ;; disclaimed.
   8 ;;====================================================================
   9 ;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
  10 ;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
  11 ;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
  12 ;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
  13 ;;====================================================================
  14         .text
  15
     ;; Symbolic register names.  Each .asg line gives the register in
     ;; its first operand the name in its second operand.
     ;; RA is the branch target of every "BNOP RA" return below.
  16         .asg    B3,RA
     ;; ARG0..ARG3 carry the incoming arguments of the routines below;
     ;; ARG4 and ARG5 are not referenced in the code visible here.
  17         .asg    A4,ARG0
  18         .asg    B4,ARG1
  19         .asg    A6,ARG2
  20         .asg    B6,ARG3
  21         .asg    A8,ARG4
  22         .asg    B8,ARG5
     ;; RET is the same register as ARG0 (A4): writing a return value
     ;; overwrites the first argument.
  23         .asg    A4,RET
     ;; FP, DP and SP are defined but not referenced in the code visible
     ;; here.
  24         .asg    A15,FP
  25         .asg    B14,DP
  26         .asg    B15,SP
  27
  28         .global _bn_mul_add_words
     ;;--------------------------------------------------------------------
     ;; _bn_mul_add_words: rp[i] = low word of (rp[i] + ap[i]*w + carry)
     ;; for i = 0..num-1; carry is the upper part of the previous sum.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = num (word count), ARG3 = w
     ;; Out:  RET  = carry left after the last word, or 0 when num == 0
     ;; NOTE(review): presumably the assembler counterpart of OpenSSL's
     ;; C bn_mul_add_words() - confirm the prototype in the BN headers.
     ;;--------------------------------------------------------------------
  29 _bn_mul_add_words:
  30         .asmfunc
  31         MV      ARG2,B0         ; B0 = num, also used as predicate
  32   [!B0] BNOP    RA              ; num == 0: return immediately...
  33 ||[!B0] MVK     0,RET           ; ...with RET = 0
  34    [B0] MVC     B0,ILC          ; loop count for the SPLOOP below
  35    [B0] ZERO    A19             ; high part of accumulator
  36 || [B0] MV      ARG0,A2         ; A2 = rp store pointer (ARG0 stays the load pointer)
  37 || [B0] MV      ARG3,A3         ; A3 = w
  38         NOP     3               ; with the 2 packets above: 5 cycles after BNOP RA
  39
  40         SPLOOP  2               ; 2*n+10
  41 ;;====================================================================
  42         LDW     *ARG1++,B7      ; ap[i]
  43         NOP     3
  44         LDW     *ARG0++,A7      ; rp[i]
  45         MPY32U  B7,A3,A17:A16   ; A17:A16 = ap[i]*w
  46         NOP     3               ; [2,0] in epilogue
  47         ADDU    A16,A7,A21:A20  ; low product word + rp[i]
  48         ADDU    A19,A21:A20,A19:A18 ; + carry from the previous word
  49 ||      MV.S    A17,A23         ; keep a copy of the high product word
  50         SPKERNEL 2,1            ; leave slot for "return value"
  51 ||      STW     A18,*A2++       ; rp[i]
  52 ||      ADD     A19,A23,A19     ; carry into the next word
  53 ;;====================================================================
  54         BNOP    RA,4
  55         MV      A19,RET         ; return value
  56         .endasmfunc
  57
  58         .global _bn_mul_words
     ;;--------------------------------------------------------------------
     ;; _bn_mul_words: rp[i] = low word of (ap[i]*w + carry) for
     ;; i = 0..num-1; carry is the upper part of the previous sum.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = num (word count), ARG3 = w
     ;; Out:  RET  = carry left after the last word, or 0 when num == 0
     ;; Unlike _bn_mul_add_words, rp is only written, so ARG0 itself
     ;; serves as the store pointer and ARG3 is used in place.
     ;;--------------------------------------------------------------------
  59 _bn_mul_words:
  60         .asmfunc
  61         MV      ARG2,B0         ; B0 = num, also used as predicate
  62   [!B0] BNOP    RA              ; num == 0: return immediately...
  63 ||[!B0] MVK     0,RET           ; ...with RET = 0
  64    [B0] MVC     B0,ILC          ; loop count for the SPLOOP below
  65    [B0] ZERO    A19             ; high part of accumulator
  66         NOP     3               ; with the 2 packets above: 5 cycles after BNOP RA
  67
  68         SPLOOP  2               ; 2*n+10
  69 ;;====================================================================
  70         LDW     *ARG1++,A7      ; ap[i]
  71         NOP     4
  72         MPY32U  A7,ARG3,A17:A16 ; A17:A16 = ap[i]*w
  73         NOP     4               ; [2,0] in epilogue
  74         ADDU    A19,A16,A19:A18 ; carry + low product word
  75 ||      MV.S    A17,A21         ; keep a copy of the high product word
  76         SPKERNEL 2,1            ; leave slot for "return value"
  77 ||      STW     A18,*ARG0++     ; rp[i]
  78 ||      ADD.L   A19,A21,A19     ; carry into the next word
  79 ;;====================================================================
  80         BNOP    RA,4
  81         MV      A19,RET         ; return value
  82         .endasmfunc
  83
  84         .global _bn_sqr_words
     ;;--------------------------------------------------------------------
     ;; _bn_sqr_words: rp[2*i] and rp[2*i+1] receive the low and high
     ;; words of ap[i]*ap[i] for i = 0..num-1.  No carry is propagated
     ;; between elements.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = num (word count)
     ;; Out:  RET is set (to 0) only on the num == 0 path; otherwise A4
     ;;       (= ARG0 = RET) is left holding the advanced store pointer.
     ;;--------------------------------------------------------------------
  85 _bn_sqr_words:
  86         .asmfunc
  87         MV      ARG2,B0         ; B0 = num, also used as predicate
  88   [!B0] BNOP    RA              ; num == 0: return immediately...
  89 ||[!B0] MVK     0,RET           ; ...with RET = 0
  90    [B0] MVC     B0,ILC          ; loop count for the SPLOOP below
  91    [B0] MV      ARG0,B2         ; B2   -> rp[0]: even (low) words
  92 || [B0] ADD     4,ARG0,ARG0     ; ARG0 -> rp[1]: odd (high) words
  93         NOP     3               ; with the 2 packets above: 5 cycles after BNOP RA
  94
  95         SPLOOP  2               ; 2*n+10
  96 ;;====================================================================
  97         LDW     *ARG1++,B7      ; ap[i]
  98         NOP     4
  99         MPY32U  B7,B7,B1:B0     ; B1:B0 = ap[i]^2 (B0 no longer needed as count)
 100         NOP     3               ; [2,0] in epilogue
 101         STW     B0,*B2++(8)     ; rp[2*i]
 102         MV      B1,A1           ; high word moved to the A side for the store below
 103         SPKERNEL 2,0            ; fully overlap BNOP RA,5
 104 ||      STW     A1,*ARG0++(8)   ; rp[2*i+1]
 105 ;;====================================================================
 106         BNOP    RA,5
 107         .endasmfunc
 108
 109         .global _bn_add_words
     ;;--------------------------------------------------------------------
     ;; _bn_add_words: rp[i] = low word of (ap[i] + bp[i] + carry) for
     ;; i = 0..num-1.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = num (word count)
     ;; Out:  RET  = carry out of the last word, or 0 when num == 0
     ;; rp is copied to A3 because A4 (ARG0) is also RET and is
     ;; overwritten with the running carry on every iteration.
     ;;--------------------------------------------------------------------
 110 _bn_add_words:
 111         .asmfunc
 112         MV      ARG3,B0         ; B0 = num, also used as predicate
 113   [!B0] BNOP    RA              ; num == 0: return immediately...
 114 ||[!B0] MVK     0,RET           ; ...with RET = 0
 115    [B0] MVC     B0,ILC          ; loop count for the SPLOOP below
 116    [B0] ZERO    A1              ; carry flag
 117 || [B0] MV      ARG0,A3         ; A3 = rp store pointer
 118         NOP     3               ; with the 2 packets above: 5 cycles after BNOP RA
 119
 120         SPLOOP  2               ; 2*n+6
 121 ;;====================================================================
 122         LDW     *ARG2++,A7      ; bp[i]
 123 ||      LDW     *ARG1++,B7      ; ap[i]
 124         NOP     4
 125         ADDU    A7,B7,A9:A8     ; A9:A8 = ap[i] + bp[i]
 126         ADDU    A1,A9:A8,A1:A0  ; + carry in; A1 = carry out, A0 = result word
 127         SPKERNEL 0,0            ; fully overlap BNOP RA,5
 128 ||      STW     A0,*A3++        ; write result
 129 ||      MV      A1,RET          ; keep carry flag in RET
 130 ;;====================================================================
 131         BNOP    RA,5
 132         .endasmfunc
 133
 134         .global _bn_sub_words
     ;;--------------------------------------------------------------------
     ;; _bn_sub_words: rp[i] = low word of (ap[i] - bp[i] - borrow) for
     ;; i = 0..num-1.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = num (word count)
     ;; Out:  RET  = borrow out of the last word (0 or 1), or 0 when
     ;;       num == 0
     ;; The borrow lives in A2 so that it can predicate the extra
     ;; "subtract 1"; bit 0 of the upper result register A1 supplies it.
     ;;--------------------------------------------------------------------
 135 _bn_sub_words:
 136         .asmfunc
 137         MV      ARG3,B0         ; B0 = num, also used as predicate
 138   [!B0] BNOP    RA              ; num == 0: return immediately...
 139 ||[!B0] MVK     0,RET           ; ...with RET = 0
 140    [B0] MVC     B0,ILC          ; loop count for the SPLOOP below
 141    [B0] ZERO    A2              ; borrow flag
 142 || [B0] MV      ARG0,A3         ; A3 = rp store pointer
 143         NOP     3               ; with the 2 packets above: 5 cycles after BNOP RA
 144
 145         SPLOOP  2               ; 2*n+6
 146 ;;====================================================================
 147         LDW     *ARG2++,A7      ; bp[i]
 148 ||      LDW     *ARG1++,B7      ; ap[i]
 149         NOP     4
 150         SUBU    B7,A7,A1:A0     ; A1:A0 = ap[i] - bp[i]
 151   [A2]  SUB     A1:A0,1,A1:A0   ; minus incoming borrow, if set
 152         SPKERNEL 0,1            ; leave slot for "return borrow flag"
 153 ||      STW     A0,*A3++        ; write result
 154 ||      AND     1,A1,A2         ; pass on borrow flag
 155 ;;====================================================================
 156         BNOP    RA,4
 157         AND     1,A1,RET        ; return borrow flag
 158         .endasmfunc
 159
 160         .global _bn_div_words
 161         .global __divull
     ;;--------------------------------------------------------------------
     ;; _bn_div_words: hands the work to the external routine __divull.
     ;; The argument shuffle shares one execute packet with CALLP, so
     ;; every MV reads the value its source register held on entry:
     ;;   A5:A4 = ARG0:ARG1   (A5 = incoming ARG0, A4 = incoming ARG1)
     ;;   B5:B4 = 0:ARG2      (B5 = 0,             B4 = incoming ARG2)
     ;; NOTE(review): the register pairs look like two 64-bit operands
     ;; (dividend ARG0:ARG1, divisor ARG2 zero-extended) - confirm
     ;; against the __divull calling convention.
     ;; CALLP names A3, not RA (B3), as its second operand and no
     ;; instruction follows the call.  NOTE(review): presumably this is
     ;; a tail call, i.e. RA still holds our caller's return address and
     ;; __divull returns there directly - confirm.
     ;;--------------------------------------------------------------------
 162 _bn_div_words:
 163         .asmfunc
 164         CALLP   __divull,A3     ; jump to rts64plus.lib
 165 ||      MV      ARG0,A5
 166 ||      MV      ARG1,ARG0
 167 ||      MV      ARG2,ARG1
 168 ||      ZERO    B5
 169         .endasmfunc
 170
 171 ;;====================================================================
 172 ;; Not really Comba algorithm, just straightforward NxM... Dedicated
 173 ;; fully unrolled real Comba implementations are asymptotically 2x
 174 ;; faster, but naturally larger undertaking. Purpose of this exercise
 175 ;; was rather to learn to master nested SPLOOPs...
 176 ;;====================================================================
 177         .global _bn_sqr_comba8
 178         .global _bn_mul_comba8
     ;;--------------------------------------------------------------------
     ;; _bn_mul_comba8: N x M word product with N = M = 8, computed as M
     ;; passes of a multiply-accumulate SPLOOP over bp[], one pass per
     ;; word of ap[].
     ;; _bn_sqr_comba8: same code, entered with bp set to ap.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = bp
     ;; Register roles:
     ;;   B0 = N                     A0 = outer passes still to start
     ;;   B1 = N-2 (initial ILC)     A3 = M (bp rewind distance)
     ;;   B2 = N-1 (rp rewind)       A5 = ap cursor
     ;;   B6 = current ap word       A9 = pre-fetched next ap word
     ;;   B4 = rp store cursor       B5 = rp load cursor
     ;;   B19 = high part of acc.    A1 = enables the rp[i] loads
     ;; A1 is 0 during the first pass, so rp[] is not read and B7 stays
     ;; at the 0 written by ZERO B7; A1 becomes 1 in the outer loop.
     ;; The sploopNxM? label is also the branch target of the disabled
     ;; (.if 0) variant in _bn_mul_comba4.
     ;;--------------------------------------------------------------------
 179 _bn_sqr_comba8:
 180         MV      ARG1,ARG2       ; squaring: bp = ap, then fall through
 181 _bn_mul_comba8:
 182         .asmfunc
 183         MVK     8,B0            ; N, RILC
 184 ||      MVK     8,A0            ; M, outer loop counter
 185 ||      MV      ARG1,A5         ; copy ap
 186 ||      MV      ARG0,B4         ; copy rp
 187 ||      ZERO    B19             ; high part of accumulator
 188         MVC     B0,RILC
 189 ||      SUB     B0,2,B1         ; N-2, initial ILC
 190 ||      SUB     B0,1,B2         ; const B2=N-1
 191 ||      LDW     *A5++,B6        ; ap[0]
 192 ||      MV      A0,A3           ; const A3=M
 193 sploopNxM?:                     ; for best performance arrange M<=N
 194    [A0] SPLOOPD 2               ; 2*n+10
 195 ||      MVC     B1,ILC
 196 ||      ADDAW   B4,B0,B5        ; B5 = rp + N words; first rewind by N-1 gives rp[1]
 197 ||      ZERO    B7              ; rp[i] term is 0 while the [A1] loads are off
 198 ||      LDW     *A5++,A9        ; pre-fetch ap[1]
 199 ||      ZERO    A1              ; first pass: do not load rp[i]
 200 ||      SUB     A0,1,A0
 201 ;;====================================================================
 202 ;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
 203 ;; This is because of Advisory 15 from TI publication SPRZ247I.
 204         LDW     *ARG2++,A7      ; bp[i]
 205         NOP     3
 206    [A1] LDW     *B5++,B7        ; rp[i]
 207         MPY32U  A7,B6,B17:B16   ; B17:B16 = bp[i] * current ap word
 208         NOP     3
 209         ADDU    B16,B7,B21:B20  ; low product word + rp[i]
 210         ADDU    B19,B21:B20,B19:B18 ; + carry from the previous word
 211 ||      MV.S    B17,B23         ; keep a copy of the high product word
 212         SPKERNEL
 213 ||      STW     B18,*B4++       ; rp[i]
 214 ||      ADD.S   B19,B23,B19     ; carry into the next word
 215 ;;====================================================================
 216 outer?:                         ; m*2*(n+1)+10
 217         SUBAW   ARG2,A3,ARG2    ; rewind bp to bp[0]
 218         SPMASKR
 219 ||      CMPGT   A0,1,A2         ; done pre-fetching ap[i+1]?
 220         MVD     A9,B6           ; move through .M unit(*)
 221    [A2] LDW     *A5++,A9        ; pre-fetch ap[i+1]
 222         SUBAW   B5,B2,B5        ; rewind rp to rp[1]
 223         MVK     1,A1            ; later passes do load rp[i]
 224    [A0] BNOP.S1 outer?,4
 225 || [A0] SUB.L   A0,1,A0
 226         STW     B19,*B4--[B2]   ; store accumulator, rewind rp to rp[1]
 227 ||      ZERO.S  B19             ; high part of accumulator
 228 ;; end of outer?
 229         BNOP    RA,5            ; return
 230         .endasmfunc
 231 ;; (*)  It should be noted that B6 is used as input to MPY32U in
 232 ;;      chronologically next cycle in *preceding* SPLOOP iteration.
 233 ;;      Normally such arrangement would require DINT, but at this
 234 ;;      point SPLOOP is draining and interrupts are disabled
 235 ;;      implicitly.
 236
 237         .global _bn_sqr_comba4
 238         .global _bn_mul_comba4
     ;;--------------------------------------------------------------------
     ;; _bn_mul_comba4: 4 x 4 word product; stores rp[0] .. rp[7].
     ;; _bn_sqr_comba4: same code, entered with bp set to ap.
     ;; In:   ARG0 = rp, ARG1 = ap, ARG2 = bp
     ;; The ".if 0" variant is not assembled; it would branch into the
     ;; NxM loop at sploopNxM? with N = M = 4.  The ".else" variant is a
     ;; fully unrolled column-by-column schedule: a[0..3] are loaded
     ;; into B16..B19 and b[0..3] into A16..A19; the A side sums the low
     ;; product words of the current column, the B side sums the high
     ;; product words, and each B-side sum is added into the following
     ;; column on the A side.
     ;;--------------------------------------------------------------------
 239 _bn_sqr_comba4:
 240         MV      ARG1,ARG2       ; squaring: bp = ap, then fall through
 241 _bn_mul_comba4:
 242         .asmfunc
 243         .if     0
 244         BNOP    sploopNxM?,3
 245         ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
 246         ;; because of read-after-write penalties, it's rather
 247         ;; n*2*(n+3)+10, or 66 cycles [plus various overheads]...
 248         MVK     4,B0            ; N, RILC
 249 ||      MVK     4,A0            ; M, outer loop counter
 250 ||      MV      ARG1,A5         ; copy ap
 251 ||      MV      ARG0,B4         ; copy rp
 252 ||      ZERO    B19             ; high part of accumulator
 253         MVC     B0,RILC
 254 ||      SUB     B0,2,B1         ; first ILC
 255 ||      SUB     B0,1,B2         ; const B2=N-1
 256 ||      LDW     *A5++,B6        ; ap[0]
 257 ||      MV      A0,A3           ; const A3=M
 258         .else
 259         ;; This alternative is exercise in fully unrolled Comba
 260         ;; algorithm implementation that operates at n*(n+1)+12, or
 261         ;; as little as 32 cycles...
 262         LDW     *ARG1[0],B16    ; a[0]
 263 ||      LDW     *ARG2[0],A16    ; b[0]
 264         LDW     *ARG1[1],B17    ; a[1]
 265 ||      LDW     *ARG2[1],A17    ; b[1]
 266         LDW     *ARG1[2],B18    ; a[2]
 267 ||      LDW     *ARG2[2],A18    ; b[2]
 268         LDW     *ARG1[3],B19    ; a[3]
 269 ||      LDW     *ARG2[3],A19    ; b[3]
 270         NOP
 271         MPY32U  A16,B16,A1:A0   ; a[0]*b[0]
 272         MPY32U  A17,B16,A23:A22 ; a[0]*b[1]
 273         MPY32U  A16,B17,A25:A24 ; a[1]*b[0]
 274         MPY32U  A16,B18,A27:A26 ; a[2]*b[0]
     ;; column 0: the low word of a[0]*b[0] is r[0]
 275         STW     A0,*ARG0[0]
 276 ||      MPY32U  A17,B17,A29:A28 ; a[1]*b[1]
     ;; column 1: A side sums low words, B side sums high words
 277         MPY32U  A18,B16,A31:A30 ; a[0]*b[2]
 278 ||      ADDU    A22,A1,A1:A0
 279         MV      A23,B0
 280 ||      MPY32U  A19,B16,A21:A20 ; a[3]*b[0]
 281 ||      ADDU    A24,A1:A0,A1:A0
 282         ADDU    A25,B0,B1:B0
 283 ||      STW     A0,*ARG0[1]
 284 ||      MPY32U  A18,B17,A23:A22 ; a[2]*b[1]
 285 ||      ADDU    A26,A1,A9:A8
     ;; column 2
 286         ADDU    A27,B1,B9:B8
 287 ||      MPY32U  A17,B18,A25:A24 ; a[1]*b[2]
 288 ||      ADDU    A28,A9:A8,A9:A8
 289         ADDU    A29,B9:B8,B9:B8
 290 ||      MPY32U  A16,B19,A27:A26 ; a[0]*b[3]
 291 ||      ADDU    A30,A9:A8,A9:A8
 292         ADDU    A31,B9:B8,B9:B8
 293 ||      ADDU    B0,A9:A8,A9:A8
 294         STW     A8,*ARG0[2]
 295 ||      ADDU    A20,A9,A1:A0
     ;; column 3
 296         ADDU    A21,B9,B1:B0
 297 ||      MPY32U  A19,B17,A21:A20 ; a[3]*b[1]
 298 ||      ADDU    A22,A1:A0,A1:A0
 299         ADDU    A23,B1:B0,B1:B0
 300 ||      MPY32U  A18,B18,A23:A22 ; a[2]*b[2]
 301 ||      ADDU    A24,A1:A0,A1:A0
 302         ADDU    A25,B1:B0,B1:B0
 303 ||      MPY32U  A17,B19,A25:A24 ; a[1]*b[3]
 304 ||      ADDU    A26,A1:A0,A1:A0
 305         ADDU    A27,B1:B0,B1:B0
 306 ||      ADDU    B8,A1:A0,A1:A0
 307         STW     A0,*ARG0[3]
 308 ||      MPY32U  A19,B18,A27:A26 ; a[3]*b[2]
 309 ||      ADDU    A20,A1,A9:A8
     ;; column 4
 310         ADDU    A21,B1,B9:B8
 311 ||      MPY32U  A18,B19,A29:A28 ; a[2]*b[3]
 312 ||      ADDU    A22,A9:A8,A9:A8
 313         ADDU    A23,B9:B8,B9:B8
 314 ||      MPY32U  A19,B19,A31:A30 ; a[3]*b[3]
 315 ||      ADDU    A24,A9:A8,A9:A8
 316         ADDU    A25,B9:B8,B9:B8
 317 ||      ADDU    B0,A9:A8,A9:A8
 318         STW     A8,*ARG0[4]
 319 ||      ADDU    A26,A9,A1:A0
     ;; column 5; the return branch is issued here
 320         ADDU    A27,B9,B1:B0
 321 ||      ADDU    A28,A1:A0,A1:A0
 322         ADDU    A29,B1:B0,B1:B0
 323 ||      BNOP    RA
 324 ||      ADDU    B8,A1:A0,A1:A0
     ;; Exactly five execute packets follow the BNOP RA above (the same
     ;; 5 cycles that "BNOP RA,5" spends elsewhere in this file); they
     ;; finish columns 5..7.
 325         STW     A0,*ARG0[5]
 326 ||      ADDU    A30,A1,A9:A8
 327         ADD     A31,B1,B8
     ;; NOTE(review): keeping the next ADDU in its own packet presumably
     ;; also separates the B8 write above from its cross-path read in
     ;; "ADD B8,A9,A9" below - confirm before merging packets.
 328         ADDU    B0,A9:A8,A9:A8  ; own packet: third of the five after BNOP RA
 329         ADD     B8,A9,A9
 330 ||      STW     A8,*ARG0[6]
 331         STW     A9,*ARG0[7]
 332         .endif
 333         .endasmfunc