bn/asm/ppc.pl: harmonize .size directive in bn_mul_words.
1 #! /usr/bin/env perl
2 # Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 # Implemented as a Perl wrapper as we want to support several different
10 # architectures with a single file. We pick the target based on the
11 # file name we are asked to generate.
12 #
13 # It should be noted though that this perl code is nothing like
14 # <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
15 # as a pre-processor to cover for platform differences in name decoration,
16 # linker tables, 32-/64-bit instruction sets...
17 #
18 # As you might know, there are several PowerPC ABIs in use. Most notably,
19 # Linux and AIX use different 32-bit ABIs. The good news is that these
20 # ABIs are similar enough to implement leaf(!) functions, which can be
21 # made ABI neutral. And that's what you find here: ABI-neutral leaf functions.
22 # In case you wonder what that is...
23 #
24 #       AIX performance
25 #
26 #       MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
27 #
28 #       The following is the performance of 32-bit compiler
29 #       generated code:
30 #
31 #       OpenSSL 0.9.6c 21 Dec 2001
32 #       built on: Tue Jun 11 11:06:51 EDT 2002
33 #       options:bn(64,32) ...
34 #compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
35 #                  sign    verify    sign/s verify/s
36 #rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
37 #rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
38 #rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
39 #rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
40 #dsa  512 bits   0.0087s   0.0106s    114.3     94.5
41 #dsa 1024 bits   0.0256s   0.0313s     39.0     32.0    
42 #
43 #       Same benchmark with this assembler code:
44 #
45 #rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
46 #rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
47 #rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
48 #rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
49 #dsa  512 bits   0.0052s   0.0062s    191.6    162.0
50 #dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
51 #
52 #       Number of operations per second increases by almost 75%
53 #
54 #       Here are performance numbers for 64-bit compiler
55 #       generated code:
56 #
57 #       OpenSSL 0.9.6g [engine] 9 Aug 2002
58 #       built on: Fri Apr 18 16:59:20 EDT 2003
59 #       options:bn(64,64) ...
60 #       compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
61 #                  sign    verify    sign/s verify/s
62 #rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
63 #rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
64 #rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
65 #rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
66 #dsa  512 bits   0.0026s   0.0032s    382.5    313.7
67 #dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
68 #
69 #       Same benchmark with this assembler code:
70 #
71 #rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
72 #rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
73 #rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
74 #rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
75 #dsa  512 bits   0.0016s   0.0020s    610.7    507.1
76 #dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
77 #       
78 #       Again, performance increases by about 75%
79 #
80 #       Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
81 #       OpenSSL 0.9.7c 30 Sep 2003
82 #
83 #       Original code.
84 #
85 #rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
86 #rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
87 #rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
88 #rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
89 #dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
90 #dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
91 #dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
92 #
93 #       Same benchmark with this assembler code:
94 #
95 #rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
96 #rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
97 #rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
98 #rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
99 #dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
100 #dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
101 #dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
102 #
103 #        Performance increase of ~60%
104 #
105 #       If you have comments or suggestions to improve code send
106 #       me a note at schari@us.ibm.com
107 #
108
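# The generated assembly is piped through ppc-xlate.pl (located below),
# with the second command-line argument naming the output file. A typical
# invocation therefore looks something like this (flavour and file name
# are illustrative, not prescribed by this script):
#
#       perl ppc.pl linux32 ppc32.s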
109 $flavour = shift;
110
111 if ($flavour =~ /32/) {
112         $BITS=  32;
113         $BNSZ=  $BITS/8;
114         $ISA=   "\"ppc\"";
115
116         $LD=    "lwz";          # load
117         $LDU=   "lwzu";         # load and update
118         $ST=    "stw";          # store
119         $STU=   "stwu";         # store and update
120         $UMULL= "mullw";        # unsigned multiply low
121         $UMULH= "mulhwu";       # unsigned multiply high
122         $UDIV=  "divwu";        # unsigned divide
123         $UCMPI= "cmplwi";       # unsigned compare with immediate
124         $UCMP=  "cmplw";        # unsigned compare
125         $CNTLZ= "cntlzw";       # count leading zeros
126         $SHL=   "slw";          # shift left
127         $SHR=   "srw";          # unsigned shift right
128         $SHRI=  "srwi";         # unsigned shift right by immediate     
129         $SHLI=  "slwi";         # shift left by immediate
130         $CLRU=  "clrlwi";       # clear upper bits
131         $INSR=  "insrwi";       # insert right
132         $ROTL=  "rotlwi";       # rotate left by immediate
133         $TR=    "tw";           # conditional trap
134 } elsif ($flavour =~ /64/) {
135         $BITS=  64;
136         $BNSZ=  $BITS/8;
137         $ISA=   "\"ppc64\"";
138
139         # same as above, but 64-bit mnemonics...
140         $LD=    "ld";           # load
141         $LDU=   "ldu";          # load and update
142         $ST=    "std";          # store
143         $STU=   "stdu";         # store and update
144         $UMULL= "mulld";        # unsigned multiply low
145         $UMULH= "mulhdu";       # unsigned multiply high
146         $UDIV=  "divdu";        # unsigned divide
147         $UCMPI= "cmpldi";       # unsigned compare with immediate
148         $UCMP=  "cmpld";        # unsigned compare
149         $CNTLZ= "cntlzd";       # count leading zeros
150         $SHL=   "sld";          # shift left
151         $SHR=   "srd";          # unsigned shift right
152         $SHRI=  "srdi";         # unsigned shift right by immediate     
153         $SHLI=  "sldi";         # shift left by immediate
154         $CLRU=  "clrldi";       # clear upper bits
155         $INSR=  "insrdi";       # insert right 
156         $ROTL=  "rotldi";       # rotate left by immediate
157         $TR=    "td";           # conditional trap
158 } else { die "nonsense $flavour"; }
159
160 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
161 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
162 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
163 die "can't locate ppc-xlate.pl";
164
165 open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
166
167 $data=<<EOF;
168 #--------------------------------------------------------------------
169 #
170 #
171 #
172 #
173 #       File:           ppc32.s
174 #
175 #       Created by:     Suresh Chari
176 #                       IBM Thomas J. Watson Research Library
177 #                       Hawthorne, NY
178 #
179 #
180 #       Description:    Optimized assembly routines for OpenSSL crypto
181 #                       on the 32-bit PowerPC platform.
182 #
183 #
184 #       Version History
185 #
186 #       2. Fixed bn_add, bn_sub and bn_div_words; added comments and
187 #          cleaned up the code. Also made a single version which can
188 #          be used with both the AIX and Linux compilers. See NOTE
189 #          below.
190 #                               12/05/03                Suresh Chari
191 #                       (with lots of help from)        Andy Polyakov
192 #
193 #       1. Initial version      10/20/02                Suresh Chari
194 #
195 #
196 #       The following file works with the xlc, cc,
197 #       and gcc compilers.
198 #
199 #       NOTE:   To get the file to link correctly with the gcc compiler
200 #               you have to change the names of the routines and remove
201 #               the first . (dot) character. This should automatically
202 #               be done in the build process.
203 #
204 #       Hand optimized assembly code for the following routines
205 #       
206 #       bn_sqr_comba4
207 #       bn_sqr_comba8
208 #       bn_mul_comba4
209 #       bn_mul_comba8
210 #       bn_sub_words
211 #       bn_add_words
212 #       bn_div_words
213 #       bn_sqr_words
214 #       bn_mul_words
215 #       bn_mul_add_words
216 #
217 #       NOTE:   It is possible to optimize this code further for
218 #       specific PowerPC or Power architectures. On the Northstar
219 #       architecture the optimizations in this file do
220 #       NOT provide much improvement.
221 #
222 #       If you have comments or suggestions to improve code send
223 #       me a note at schari\@us.ibm.com
224 #
225 #--------------------------------------------------------------------------
226 #
227 #       Defines to be used in the assembly code.
228 #       
229 #.set r0,0      # we use it as storage for value of 0
230 #.set SP,1      # preserved
231 #.set RTOC,2    # preserved 
232 #.set r3,3      # 1st argument/return value
233 #.set r4,4      # 2nd argument/volatile register
234 #.set r5,5      # 3rd argument/volatile register
235 #.set r6,6      # ...
236 #.set r7,7
237 #.set r8,8
238 #.set r9,9
239 #.set r10,10
240 #.set r11,11
241 #.set r12,12
242 #.set r13,13    # not used, nor any other "below" it...
243
244 #       Declare function names to be global
245 #       NOTE:   For gcc these names MUST be changed to remove
246 #               the first . i.e. for example change ".bn_sqr_comba4"
247 #               to "bn_sqr_comba4". This should be automatically done
248 #               in the build.
249         
250         .globl  .bn_sqr_comba4
251         .globl  .bn_sqr_comba8
252         .globl  .bn_mul_comba4
253         .globl  .bn_mul_comba8
254         .globl  .bn_sub_words
255         .globl  .bn_add_words
256         .globl  .bn_div_words
257         .globl  .bn_sqr_words
258         .globl  .bn_mul_words
259         .globl  .bn_mul_add_words
260         
261 # .text section
262         
263         .machine        "any"
264
265 #
266 #       NOTE:   The following label name should be changed to
267 #               "bn_sqr_comba4" i.e. remove the first dot
268 #               for the gcc compiler. This should be automatically
269 #               done in the build
270 #
271
272 .align  4
273 .bn_sqr_comba4:
274 #
275 # Optimized version of bn_sqr_comba4.
276 #
277 # void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
278 # r3 contains r
279 # r4 contains a
280 #
281 # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:       
282
283 # r5,r6 are the two BN_ULONGs being multiplied.
284 # r7,r8 are the results of the 32x32 giving 64 bit multiply.
285 # r9,r10, r11 are the equivalents of c1,c2, c3.
286 # Here's the assembly
287 #
288 #
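# For reference, each sqr_add_c/sqr_add_c2 step below open-codes an
# accumulation of a double-word square (or twice a cross product) into a
# rotating three-word total. Conceptually, and only as a sketch of the
# generic comba macros in crypto/bn/bn_asm.c, not a literal copy:
#
#       sqr_add_c(a,i,c0,c1,c2):     (hi,lo) = a[i]*a[i]
#                                    c0 += lo; c1 += hi + carry; c2 += carry
#       sqr_add_c2(a,i,j,c0,c1,c2):  (hi,lo) = 2*a[i]*a[j]
#                                    c0 += lo; c1 += hi + carry; c2 += carry
#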
289         xor             r0,r0,r0                # set r0 = 0. Used in the addze
290                                                 # instructions below
291         
292                                                 #sqr_add_c(a,0,c1,c2,c3)
293         $LD             r5,`0*$BNSZ`(r4)                
294         $UMULL          r9,r5,r5                
295         $UMULH          r10,r5,r5               #in the first iteration no add
296                                                 #is needed since c1=c2=c3=0.
297                                                 # Note c3(r11) is NOT set to 0
298                                                 # here but will be set below.
299
300         $ST             r9,`0*$BNSZ`(r3)        # r[0]=c1;
301                                                 # sqr_add_c2(a,1,0,c2,c3,c1);
302         $LD             r6,`1*$BNSZ`(r4)                
303         $UMULL          r7,r5,r6
304         $UMULH          r8,r5,r6
305                                         
306         addc            r7,r7,r7                # compute (r7,r8)=2*(r7,r8)
307         adde            r8,r8,r8
308         addze           r9,r0                   # catch the carry, if any:
309                                                 # r9 = r0(=0) + carry
310         
311         addc            r10,r7,r10              # now add to the temp result.
312         addze           r11,r8                  # r11 = r8 + carry (c3 was 0)
313         addze           r9,r9
314         
315         $ST             r10,`1*$BNSZ`(r3)       #r[1]=c2; 
316                                                 #sqr_add_c(a,1,c3,c1,c2)
317         $UMULL          r7,r6,r6
318         $UMULH          r8,r6,r6
319         addc            r11,r7,r11
320         adde            r9,r8,r9
321         addze           r10,r0
322                                                 #sqr_add_c2(a,2,0,c3,c1,c2)
323         $LD             r6,`2*$BNSZ`(r4)
324         $UMULL          r7,r5,r6
325         $UMULH          r8,r5,r6
326         
327         addc            r7,r7,r7
328         adde            r8,r8,r8
329         addze           r10,r10
330         
331         addc            r11,r7,r11
332         adde            r9,r8,r9
333         addze           r10,r10
334         $ST             r11,`2*$BNSZ`(r3)       #r[2]=c3 
335                                                 #sqr_add_c2(a,3,0,c1,c2,c3);
336         $LD             r6,`3*$BNSZ`(r4)                
337         $UMULL          r7,r5,r6
338         $UMULH          r8,r5,r6
339         addc            r7,r7,r7
340         adde            r8,r8,r8
341         addze           r11,r0
342         
343         addc            r9,r7,r9
344         adde            r10,r8,r10
345         addze           r11,r11
346                                                 #sqr_add_c2(a,2,1,c1,c2,c3);
347         $LD             r5,`1*$BNSZ`(r4)
348         $LD             r6,`2*$BNSZ`(r4)
349         $UMULL          r7,r5,r6
350         $UMULH          r8,r5,r6
351         
352         addc            r7,r7,r7
353         adde            r8,r8,r8
354         addze           r11,r11
355         addc            r9,r7,r9
356         adde            r10,r8,r10
357         addze           r11,r11
358         $ST             r9,`3*$BNSZ`(r3)        #r[3]=c1
359                                                 #sqr_add_c(a,2,c2,c3,c1);
360         $UMULL          r7,r6,r6
361         $UMULH          r8,r6,r6
362         addc            r10,r7,r10
363         adde            r11,r8,r11
364         addze           r9,r0
365                                                 #sqr_add_c2(a,3,1,c2,c3,c1);
366         $LD             r6,`3*$BNSZ`(r4)                
367         $UMULL          r7,r5,r6
368         $UMULH          r8,r5,r6
369         addc            r7,r7,r7
370         adde            r8,r8,r8
371         addze           r9,r9
372         
373         addc            r10,r7,r10
374         adde            r11,r8,r11
375         addze           r9,r9
376         $ST             r10,`4*$BNSZ`(r3)       #r[4]=c2
377                                                 #sqr_add_c2(a,3,2,c3,c1,c2);
378         $LD             r5,`2*$BNSZ`(r4)                
379         $UMULL          r7,r5,r6
380         $UMULH          r8,r5,r6
381         addc            r7,r7,r7
382         adde            r8,r8,r8
383         addze           r10,r0
384         
385         addc            r11,r7,r11
386         adde            r9,r8,r9
387         addze           r10,r10
388         $ST             r11,`5*$BNSZ`(r3)       #r[5] = c3
389                                                 #sqr_add_c(a,3,c1,c2,c3);
390         $UMULL          r7,r6,r6                
391         $UMULH          r8,r6,r6
392         addc            r9,r7,r9
393         adde            r10,r8,r10
394
395         $ST             r9,`6*$BNSZ`(r3)        #r[6]=c1
396         $ST             r10,`7*$BNSZ`(r3)       #r[7]=c2
397         blr
398         .long   0
399         .byte   0,12,0x14,0,0,0,2,0
400         .long   0
401 .size   .bn_sqr_comba4,.-.bn_sqr_comba4
402
403 #
404 #       NOTE:   The following label name should be changed to
405 #               "bn_sqr_comba8" i.e. remove the first dot
406 #               for the gcc compiler. This should be automatically
407 #               done in the build
408 #
409         
410 .align  4
411 .bn_sqr_comba8:
412 #
413 # This is an optimized version of the bn_sqr_comba8 routine.
414 # It makes tight use of the adde instruction.
415 #
416 #
417 # void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
418 # r3 contains r
419 # r4 contains a
420 #
421 # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:       
422
423 # r5,r6 are the two BN_ULONGs being multiplied.
424 # r7,r8 are the results of the 32x32 giving 64 bit multiply.
425 # r9,r10, r11 are the equivalents of c1,c2, c3.
426 #
427 # A possible optimization, loading all 8 words of a into registers,
428 # doesn't provide any speedup.
429
430
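# The sqr_add_c/sqr_add_c2 steps below follow the same three-word
# accumulation pattern sketched above .bn_sqr_comba4, just unrolled
# for eight words instead of four.
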
431         xor             r0,r0,r0                #set r0 = 0. Used in addze
432                                                 #instructions below.
433
434                                                 #sqr_add_c(a,0,c1,c2,c3);
435         $LD             r5,`0*$BNSZ`(r4)
436         $UMULL          r9,r5,r5                #1st iteration: no carries.
437         $UMULH          r10,r5,r5
438         $ST             r9,`0*$BNSZ`(r3)        # r[0]=c1;
439                                                 #sqr_add_c2(a,1,0,c2,c3,c1);
440         $LD             r6,`1*$BNSZ`(r4)
441         $UMULL          r7,r5,r6
442         $UMULH          r8,r5,r6        
443         
444         addc            r10,r7,r10              #add the two-register number
445         adde            r11,r8,r0               # (r8,r7) to the three-register
446         addze           r9,r0                   # number (r9,r11,r10). NOTE: r0=0
447         
448         addc            r10,r7,r10              #add the two-register number
449         adde            r11,r8,r11              # (r8,r7) to the three-register
450         addze           r9,r9                   # number (r9,r11,r10).
451         
452         $ST             r10,`1*$BNSZ`(r3)       # r[1]=c2
453                                 
454                                                 #sqr_add_c(a,1,c3,c1,c2);
455         $UMULL          r7,r6,r6
456         $UMULH          r8,r6,r6
457         addc            r11,r7,r11
458         adde            r9,r8,r9
459         addze           r10,r0
460                                                 #sqr_add_c2(a,2,0,c3,c1,c2);
461         $LD             r6,`2*$BNSZ`(r4)
462         $UMULL          r7,r5,r6
463         $UMULH          r8,r5,r6
464         
465         addc            r11,r7,r11
466         adde            r9,r8,r9
467         addze           r10,r10
468         
469         addc            r11,r7,r11
470         adde            r9,r8,r9
471         addze           r10,r10
472         
473         $ST             r11,`2*$BNSZ`(r3)       #r[2]=c3
474                                                 #sqr_add_c2(a,3,0,c1,c2,c3);
475         $LD             r6,`3*$BNSZ`(r4)        #r6 = a[3]. r5 is already a[0].
476         $UMULL          r7,r5,r6
477         $UMULH          r8,r5,r6
478         
479         addc            r9,r7,r9
480         adde            r10,r8,r10
481         addze           r11,r0
482         
483         addc            r9,r7,r9
484         adde            r10,r8,r10
485         addze           r11,r11
486                                                 #sqr_add_c2(a,2,1,c1,c2,c3);
487         $LD             r5,`1*$BNSZ`(r4)
488         $LD             r6,`2*$BNSZ`(r4)
489         $UMULL          r7,r5,r6
490         $UMULH          r8,r5,r6
491         
492         addc            r9,r7,r9
493         adde            r10,r8,r10
494         addze           r11,r11
495         
496         addc            r9,r7,r9
497         adde            r10,r8,r10
498         addze           r11,r11
499         
500         $ST             r9,`3*$BNSZ`(r3)        #r[3]=c1;
501                                                 #sqr_add_c(a,2,c2,c3,c1);
502         $UMULL          r7,r6,r6
503         $UMULH          r8,r6,r6
504         
505         addc            r10,r7,r10
506         adde            r11,r8,r11
507         addze           r9,r0
508                                                 #sqr_add_c2(a,3,1,c2,c3,c1);
509         $LD             r6,`3*$BNSZ`(r4)
510         $UMULL          r7,r5,r6
511         $UMULH          r8,r5,r6
512         
513         addc            r10,r7,r10
514         adde            r11,r8,r11
515         addze           r9,r9
516         
517         addc            r10,r7,r10
518         adde            r11,r8,r11
519         addze           r9,r9
520                                                 #sqr_add_c2(a,4,0,c2,c3,c1);
521         $LD             r5,`0*$BNSZ`(r4)
522         $LD             r6,`4*$BNSZ`(r4)
523         $UMULL          r7,r5,r6
524         $UMULH          r8,r5,r6
525         
526         addc            r10,r7,r10
527         adde            r11,r8,r11
528         addze           r9,r9
529         
530         addc            r10,r7,r10
531         adde            r11,r8,r11
532         addze           r9,r9
533         $ST             r10,`4*$BNSZ`(r3)       #r[4]=c2;
534                                                 #sqr_add_c2(a,5,0,c3,c1,c2);
535         $LD             r6,`5*$BNSZ`(r4)
536         $UMULL          r7,r5,r6
537         $UMULH          r8,r5,r6
538         
539         addc            r11,r7,r11
540         adde            r9,r8,r9
541         addze           r10,r0
542         
543         addc            r11,r7,r11
544         adde            r9,r8,r9
545         addze           r10,r10
546                                                 #sqr_add_c2(a,4,1,c3,c1,c2);
547         $LD             r5,`1*$BNSZ`(r4)
548         $LD             r6,`4*$BNSZ`(r4)
549         $UMULL          r7,r5,r6
550         $UMULH          r8,r5,r6
551         
552         addc            r11,r7,r11
553         adde            r9,r8,r9
554         addze           r10,r10
555         
556         addc            r11,r7,r11
557         adde            r9,r8,r9
558         addze           r10,r10
559                                                 #sqr_add_c2(a,3,2,c3,c1,c2);
560         $LD             r5,`2*$BNSZ`(r4)
561         $LD             r6,`3*$BNSZ`(r4)
562         $UMULL          r7,r5,r6
563         $UMULH          r8,r5,r6
564         
565         addc            r11,r7,r11
566         adde            r9,r8,r9
567         addze           r10,r10
568         
569         addc            r11,r7,r11
570         adde            r9,r8,r9
571         addze           r10,r10
572         $ST             r11,`5*$BNSZ`(r3)       #r[5]=c3;
573                                                 #sqr_add_c(a,3,c1,c2,c3);
574         $UMULL          r7,r6,r6
575         $UMULH          r8,r6,r6
576         addc            r9,r7,r9
577         adde            r10,r8,r10
578         addze           r11,r0
579                                                 #sqr_add_c2(a,4,2,c1,c2,c3);
580         $LD             r6,`4*$BNSZ`(r4)
581         $UMULL          r7,r5,r6
582         $UMULH          r8,r5,r6
583         
584         addc            r9,r7,r9
585         adde            r10,r8,r10
586         addze           r11,r11
587         
588         addc            r9,r7,r9
589         adde            r10,r8,r10
590         addze           r11,r11
591                                                 #sqr_add_c2(a,5,1,c1,c2,c3);
592         $LD             r5,`1*$BNSZ`(r4)
593         $LD             r6,`5*$BNSZ`(r4)
594         $UMULL          r7,r5,r6
595         $UMULH          r8,r5,r6
596         
597         addc            r9,r7,r9
598         adde            r10,r8,r10
599         addze           r11,r11
600         
601         addc            r9,r7,r9
602         adde            r10,r8,r10
603         addze           r11,r11
604                                                 #sqr_add_c2(a,6,0,c1,c2,c3);
605         $LD             r5,`0*$BNSZ`(r4)
606         $LD             r6,`6*$BNSZ`(r4)
607         $UMULL          r7,r5,r6
608         $UMULH          r8,r5,r6
609         addc            r9,r7,r9
610         adde            r10,r8,r10
611         addze           r11,r11
612         addc            r9,r7,r9
613         adde            r10,r8,r10
614         addze           r11,r11
615         $ST             r9,`6*$BNSZ`(r3)        #r[6]=c1;
616                                                 #sqr_add_c2(a,7,0,c2,c3,c1);
617         $LD             r6,`7*$BNSZ`(r4)
618         $UMULL          r7,r5,r6
619         $UMULH          r8,r5,r6
620         
621         addc            r10,r7,r10
622         adde            r11,r8,r11
623         addze           r9,r0
624         addc            r10,r7,r10
625         adde            r11,r8,r11
626         addze           r9,r9
627                                                 #sqr_add_c2(a,6,1,c2,c3,c1);
628         $LD             r5,`1*$BNSZ`(r4)
629         $LD             r6,`6*$BNSZ`(r4)
630         $UMULL          r7,r5,r6
631         $UMULH          r8,r5,r6
632         
633         addc            r10,r7,r10
634         adde            r11,r8,r11
635         addze           r9,r9
636         addc            r10,r7,r10
637         adde            r11,r8,r11
638         addze           r9,r9
639                                                 #sqr_add_c2(a,5,2,c2,c3,c1);
640         $LD             r5,`2*$BNSZ`(r4)
641         $LD             r6,`5*$BNSZ`(r4)
642         $UMULL          r7,r5,r6
643         $UMULH          r8,r5,r6
644         addc            r10,r7,r10
645         adde            r11,r8,r11
646         addze           r9,r9
647         addc            r10,r7,r10
648         adde            r11,r8,r11
649         addze           r9,r9
650                                                 #sqr_add_c2(a,4,3,c2,c3,c1);
651         $LD             r5,`3*$BNSZ`(r4)
652         $LD             r6,`4*$BNSZ`(r4)
653         $UMULL          r7,r5,r6
654         $UMULH          r8,r5,r6
655         
656         addc            r10,r7,r10
657         adde            r11,r8,r11
658         addze           r9,r9
659         addc            r10,r7,r10
660         adde            r11,r8,r11
661         addze           r9,r9
662         $ST             r10,`7*$BNSZ`(r3)       #r[7]=c2;
663                                                 #sqr_add_c(a,4,c3,c1,c2);
664         $UMULL          r7,r6,r6
665         $UMULH          r8,r6,r6
666         addc            r11,r7,r11
667         adde            r9,r8,r9
668         addze           r10,r0
669                                                 #sqr_add_c2(a,5,3,c3,c1,c2);
670         $LD             r6,`5*$BNSZ`(r4)
671         $UMULL          r7,r5,r6
672         $UMULH          r8,r5,r6
673         addc            r11,r7,r11
674         adde            r9,r8,r9
675         addze           r10,r10
676         addc            r11,r7,r11
677         adde            r9,r8,r9
678         addze           r10,r10
679                                                 #sqr_add_c2(a,6,2,c3,c1,c2);
680         $LD             r5,`2*$BNSZ`(r4)
681         $LD             r6,`6*$BNSZ`(r4)
682         $UMULL          r7,r5,r6
683         $UMULH          r8,r5,r6
684         addc            r11,r7,r11
685         adde            r9,r8,r9
686         addze           r10,r10
687         
688         addc            r11,r7,r11
689         adde            r9,r8,r9
690         addze           r10,r10
691                                                 #sqr_add_c2(a,7,1,c3,c1,c2);
692         $LD             r5,`1*$BNSZ`(r4)
693         $LD             r6,`7*$BNSZ`(r4)
694         $UMULL          r7,r5,r6
695         $UMULH          r8,r5,r6
696         addc            r11,r7,r11
697         adde            r9,r8,r9
698         addze           r10,r10
699         addc            r11,r7,r11
700         adde            r9,r8,r9
701         addze           r10,r10
702         $ST             r11,`8*$BNSZ`(r3)       #r[8]=c3;
703                                                 #sqr_add_c2(a,7,2,c1,c2,c3);
704         $LD             r5,`2*$BNSZ`(r4)
705         $UMULL          r7,r5,r6
706         $UMULH          r8,r5,r6
707         
708         addc            r9,r7,r9
709         adde            r10,r8,r10
710         addze           r11,r0
711         addc            r9,r7,r9
712         adde            r10,r8,r10
713         addze           r11,r11
714                                                 #sqr_add_c2(a,6,3,c1,c2,c3);
715         $LD             r5,`3*$BNSZ`(r4)
716         $LD             r6,`6*$BNSZ`(r4)
717         $UMULL          r7,r5,r6
718         $UMULH          r8,r5,r6
719         addc            r9,r7,r9
720         adde            r10,r8,r10
721         addze           r11,r11
722         addc            r9,r7,r9
723         adde            r10,r8,r10
724         addze           r11,r11
725                                                 #sqr_add_c2(a,5,4,c1,c2,c3);
726         $LD             r5,`4*$BNSZ`(r4)
727         $LD             r6,`5*$BNSZ`(r4)
728         $UMULL          r7,r5,r6
729         $UMULH          r8,r5,r6
730         addc            r9,r7,r9
731         adde            r10,r8,r10
732         addze           r11,r11
733         addc            r9,r7,r9
734         adde            r10,r8,r10
735         addze           r11,r11
736         $ST             r9,`9*$BNSZ`(r3)        #r[9]=c1;
737                                                 #sqr_add_c(a,5,c2,c3,c1);
738         $UMULL          r7,r6,r6
739         $UMULH          r8,r6,r6
740         addc            r10,r7,r10
741         adde            r11,r8,r11
742         addze           r9,r0
743                                                 #sqr_add_c2(a,6,4,c2,c3,c1);
744         $LD             r6,`6*$BNSZ`(r4)
745         $UMULL          r7,r5,r6
746         $UMULH          r8,r5,r6
747         addc            r10,r7,r10
748         adde            r11,r8,r11
749         addze           r9,r9
750         addc            r10,r7,r10
751         adde            r11,r8,r11
752         addze           r9,r9
753                                                 #sqr_add_c2(a,7,3,c2,c3,c1);
754         $LD             r5,`3*$BNSZ`(r4)
755         $LD             r6,`7*$BNSZ`(r4)
756         $UMULL          r7,r5,r6
757         $UMULH          r8,r5,r6
758         addc            r10,r7,r10
759         adde            r11,r8,r11
760         addze           r9,r9
761         addc            r10,r7,r10
762         adde            r11,r8,r11
763         addze           r9,r9
764         $ST             r10,`10*$BNSZ`(r3)      #r[10]=c2;
765                                                 #sqr_add_c2(a,7,4,c3,c1,c2);
766         $LD             r5,`4*$BNSZ`(r4)
767         $UMULL          r7,r5,r6
768         $UMULH          r8,r5,r6
769         addc            r11,r7,r11
770         adde            r9,r8,r9
771         addze           r10,r0
772         addc            r11,r7,r11
773         adde            r9,r8,r9
774         addze           r10,r10
775                                                 #sqr_add_c2(a,6,5,c3,c1,c2);
776         $LD             r5,`5*$BNSZ`(r4)
777         $LD             r6,`6*$BNSZ`(r4)
778         $UMULL          r7,r5,r6
779         $UMULH          r8,r5,r6
780         addc            r11,r7,r11
781         adde            r9,r8,r9
782         addze           r10,r10
783         addc            r11,r7,r11
784         adde            r9,r8,r9
785         addze           r10,r10
786         $ST             r11,`11*$BNSZ`(r3)      #r[11]=c3;
787                                                 #sqr_add_c(a,6,c1,c2,c3);
788         $UMULL          r7,r6,r6
789         $UMULH          r8,r6,r6
790         addc            r9,r7,r9
791         adde            r10,r8,r10
792         addze           r11,r0
793                                                 #sqr_add_c2(a,7,5,c1,c2,c3)
794         $LD             r6,`7*$BNSZ`(r4)
795         $UMULL          r7,r5,r6
796         $UMULH          r8,r5,r6
797         addc            r9,r7,r9
798         adde            r10,r8,r10
799         addze           r11,r11
800         addc            r9,r7,r9
801         adde            r10,r8,r10
802         addze           r11,r11
803         $ST             r9,`12*$BNSZ`(r3)       #r[12]=c1;
804         
805                                                 #sqr_add_c2(a,7,6,c2,c3,c1)
806         $LD             r5,`6*$BNSZ`(r4)
807         $UMULL          r7,r5,r6
808         $UMULH          r8,r5,r6
809         addc            r10,r7,r10
810         adde            r11,r8,r11
811         addze           r9,r0
812         addc            r10,r7,r10
813         adde            r11,r8,r11
814         addze           r9,r9
815         $ST             r10,`13*$BNSZ`(r3)      #r[13]=c2;
816                                                 #sqr_add_c(a,7,c3,c1,c2);
817         $UMULL          r7,r6,r6
818         $UMULH          r8,r6,r6
819         addc            r11,r7,r11
820         adde            r9,r8,r9
821         $ST             r11,`14*$BNSZ`(r3)      #r[14]=c3;
822         $ST             r9, `15*$BNSZ`(r3)      #r[15]=c1;
823
824
825         blr
826         .long   0
827         .byte   0,12,0x14,0,0,0,2,0
828         .long   0
829 .size   .bn_sqr_comba8,.-.bn_sqr_comba8
830
831 #
832 #       NOTE:   The following label name should be changed to
833 #               "bn_mul_comba4" i.e. remove the first dot
834 #               for the gcc compiler. This should be automatically
835 #               done in the build
836 #
837
838 .align  4
839 .bn_mul_comba4:
840 #
841 # This is an optimized version of the bn_mul_comba4 routine.
842 #
843 # void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
844 # r3 contains r
845 # r4 contains a
846 # r5 contains b
847 # r6, r7 are the 2 BN_ULONGs being multiplied.
848 # r8, r9 are the results of the 32x32 giving 64 bit multiply.
849 # r10, r11, r12 are the equivalents of c1, c2, and c3.
850 #
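# For reference, each mul_add_c(a,b,c0,c1,c2) step below is a sketch-level
# equivalent of the generic comba macro in crypto/bn/bn_asm.c, i.e.
# conceptually:
#
#       (hi,lo) = a * b
#       c0 += lo; c1 += hi + carry; c2 += carry
#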
851         xor     r0,r0,r0                #r0=0. Used in addze below.
852                                         #mul_add_c(a[0],b[0],c1,c2,c3);
853         $LD     r6,`0*$BNSZ`(r4)                
854         $LD     r7,`0*$BNSZ`(r5)                
855         $UMULL  r10,r6,r7               
856         $UMULH  r11,r6,r7               
857         $ST     r10,`0*$BNSZ`(r3)       #r[0]=c1
858                                         #mul_add_c(a[0],b[1],c2,c3,c1);
859         $LD     r7,`1*$BNSZ`(r5)                
860         $UMULL  r8,r6,r7
861         $UMULH  r9,r6,r7
862         addc    r11,r8,r11
863         adde    r12,r9,r0
864         addze   r10,r0
865                                         #mul_add_c(a[1],b[0],c2,c3,c1);
866         $LD     r6, `1*$BNSZ`(r4)               
867         $LD     r7, `0*$BNSZ`(r5)               
868         $UMULL  r8,r6,r7
869         $UMULH  r9,r6,r7
870         addc    r11,r8,r11
871         adde    r12,r9,r12
872         addze   r10,r10
873         $ST     r11,`1*$BNSZ`(r3)       #r[1]=c2
874                                         #mul_add_c(a[2],b[0],c3,c1,c2);
875         $LD     r6,`2*$BNSZ`(r4)                
876         $UMULL  r8,r6,r7
877         $UMULH  r9,r6,r7
878         addc    r12,r8,r12
879         adde    r10,r9,r10
880         addze   r11,r0
881                                         #mul_add_c(a[1],b[1],c3,c1,c2);
882         $LD     r6,`1*$BNSZ`(r4)                
883         $LD     r7,`1*$BNSZ`(r5)                
884         $UMULL  r8,r6,r7
885         $UMULH  r9,r6,r7
886         addc    r12,r8,r12
887         adde    r10,r9,r10
888         addze   r11,r11
889                                         #mul_add_c(a[0],b[2],c3,c1,c2);
890         $LD     r6,`0*$BNSZ`(r4)                
891         $LD     r7,`2*$BNSZ`(r5)                
892         $UMULL  r8,r6,r7
893         $UMULH  r9,r6,r7
894         addc    r12,r8,r12
895         adde    r10,r9,r10
896         addze   r11,r11
897         $ST     r12,`2*$BNSZ`(r3)       #r[2]=c3
898                                         #mul_add_c(a[0],b[3],c1,c2,c3);
899         $LD     r7,`3*$BNSZ`(r5)                
900         $UMULL  r8,r6,r7
901         $UMULH  r9,r6,r7
902         addc    r10,r8,r10
903         adde    r11,r9,r11
904         addze   r12,r0
905                                         #mul_add_c(a[1],b[2],c1,c2,c3);
906         $LD     r6,`1*$BNSZ`(r4)
907         $LD     r7,`2*$BNSZ`(r5)
908         $UMULL  r8,r6,r7
909         $UMULH  r9,r6,r7
910         addc    r10,r8,r10
911         adde    r11,r9,r11
912         addze   r12,r12
913                                         #mul_add_c(a[2],b[1],c1,c2,c3);
914         $LD     r6,`2*$BNSZ`(r4)
915         $LD     r7,`1*$BNSZ`(r5)
916         $UMULL  r8,r6,r7
917         $UMULH  r9,r6,r7
918         addc    r10,r8,r10
919         adde    r11,r9,r11
920         addze   r12,r12
921                                         #mul_add_c(a[3],b[0],c1,c2,c3);
922         $LD     r6,`3*$BNSZ`(r4)
923         $LD     r7,`0*$BNSZ`(r5)
924         $UMULL  r8,r6,r7
925         $UMULH  r9,r6,r7
926         addc    r10,r8,r10
927         adde    r11,r9,r11
928         addze   r12,r12
929         $ST     r10,`3*$BNSZ`(r3)       #r[3]=c1
930                                         #mul_add_c(a[3],b[1],c2,c3,c1);
931         $LD     r7,`1*$BNSZ`(r5)                
932         $UMULL  r8,r6,r7
933         $UMULH  r9,r6,r7
934         addc    r11,r8,r11
935         adde    r12,r9,r12
936         addze   r10,r0
937                                         #mul_add_c(a[2],b[2],c2,c3,c1);
938         $LD     r6,`2*$BNSZ`(r4)
939         $LD     r7,`2*$BNSZ`(r5)
940         $UMULL  r8,r6,r7
941         $UMULH  r9,r6,r7
942         addc    r11,r8,r11
943         adde    r12,r9,r12
944         addze   r10,r10
945                                         #mul_add_c(a[1],b[3],c2,c3,c1);
946         $LD     r6,`1*$BNSZ`(r4)
947         $LD     r7,`3*$BNSZ`(r5)
948         $UMULL  r8,r6,r7
949         $UMULH  r9,r6,r7
950         addc    r11,r8,r11
951         adde    r12,r9,r12
952         addze   r10,r10
953         $ST     r11,`4*$BNSZ`(r3)       #r[4]=c2
954                                         #mul_add_c(a[2],b[3],c3,c1,c2);
955         $LD     r6,`2*$BNSZ`(r4)                
956         $UMULL  r8,r6,r7
957         $UMULH  r9,r6,r7
958         addc    r12,r8,r12
959         adde    r10,r9,r10
960         addze   r11,r0
961                                         #mul_add_c(a[3],b[2],c3,c1,c2);
962         $LD     r6,`3*$BNSZ`(r4)
963         $LD     r7,`2*$BNSZ`(r5)
964         $UMULL  r8,r6,r7
965         $UMULH  r9,r6,r7
966         addc    r12,r8,r12
967         adde    r10,r9,r10
968         addze   r11,r11
969         $ST     r12,`5*$BNSZ`(r3)       #r[5]=c3
970                                         #mul_add_c(a[3],b[3],c1,c2,c3);
971         $LD     r7,`3*$BNSZ`(r5)                
972         $UMULL  r8,r6,r7
973         $UMULH  r9,r6,r7
974         addc    r10,r8,r10
975         adde    r11,r9,r11
976
977         $ST     r10,`6*$BNSZ`(r3)       #r[6]=c1
978         $ST     r11,`7*$BNSZ`(r3)       #r[7]=c2
979         blr
980         .long   0
981         .byte   0,12,0x14,0,0,0,3,0
982         .long   0
983 .size   .bn_mul_comba4,.-.bn_mul_comba4
984
985 #
986 #       NOTE:   The following label name should be changed to
987 #               "bn_mul_comba8" i.e. remove the first dot
988 #               for the gcc compiler. This should be automatically
989 #               done in the build
990 #
991         
992 .align  4
993 .bn_mul_comba8:
994 #
995 # Optimized version of the bn_mul_comba8 routine.
996 #
997 # void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
998 # r3 contains r
999 # r4 contains a
1000 # r5 contains b
1001 # r6, r7 are the 2 BN_ULONGs being multiplied.
1002 # r8, r9 are the results of the 32x32 giving 64 bit multiply.
1003 # r10, r11, r12 are the equivalents of c1, c2, and c3.
1004 #
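# The mul_add_c steps below follow the same three-word accumulation
# pattern sketched above .bn_mul_comba4, unrolled for eight words.
#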
1005         xor     r0,r0,r0                #r0=0. Used in addze below.
1006         
1007                                         #mul_add_c(a[0],b[0],c1,c2,c3);
1008         $LD     r6,`0*$BNSZ`(r4)        #a[0]
1009         $LD     r7,`0*$BNSZ`(r5)        #b[0]
1010         $UMULL  r10,r6,r7
1011         $UMULH  r11,r6,r7
1012         $ST     r10,`0*$BNSZ`(r3)       #r[0]=c1;
1013                                         #mul_add_c(a[0],b[1],c2,c3,c1);
1014         $LD     r7,`1*$BNSZ`(r5)
1015         $UMULL  r8,r6,r7
1016         $UMULH  r9,r6,r7
1017         addc    r11,r11,r8
1018         addze   r12,r9                  # since we didn't set r12 to zero before.
1019         addze   r10,r0
1020                                         #mul_add_c(a[1],b[0],c2,c3,c1);
1021         $LD     r6,`1*$BNSZ`(r4)
1022         $LD     r7,`0*$BNSZ`(r5)
1023         $UMULL  r8,r6,r7
1024         $UMULH  r9,r6,r7
1025         addc    r11,r11,r8
1026         adde    r12,r12,r9
1027         addze   r10,r10
1028         $ST     r11,`1*$BNSZ`(r3)       #r[1]=c2;
1029                                         #mul_add_c(a[2],b[0],c3,c1,c2);
1030         $LD     r6,`2*$BNSZ`(r4)
1031         $UMULL  r8,r6,r7
1032         $UMULH  r9,r6,r7
1033         addc    r12,r12,r8
1034         adde    r10,r10,r9
1035         addze   r11,r0
1036                                         #mul_add_c(a[1],b[1],c3,c1,c2);
1037         $LD     r6,`1*$BNSZ`(r4)
1038         $LD     r7,`1*$BNSZ`(r5)
1039         $UMULL  r8,r6,r7
1040         $UMULH  r9,r6,r7
1041         addc    r12,r12,r8
1042         adde    r10,r10,r9
1043         addze   r11,r11
1044                                         #mul_add_c(a[0],b[2],c3,c1,c2);
1045         $LD     r6,`0*$BNSZ`(r4)
1046         $LD     r7,`2*$BNSZ`(r5)
1047         $UMULL  r8,r6,r7
1048         $UMULH  r9,r6,r7
1049         addc    r12,r12,r8
1050         adde    r10,r10,r9
1051         addze   r11,r11
1052         $ST     r12,`2*$BNSZ`(r3)       #r[2]=c3;
1053                                         #mul_add_c(a[0],b[3],c1,c2,c3);
1054         $LD     r7,`3*$BNSZ`(r5)
1055         $UMULL  r8,r6,r7
1056         $UMULH  r9,r6,r7
1057         addc    r10,r10,r8
1058         adde    r11,r11,r9
1059         addze   r12,r0
1060                                         #mul_add_c(a[1],b[2],c1,c2,c3);
1061         $LD     r6,`1*$BNSZ`(r4)
1062         $LD     r7,`2*$BNSZ`(r5)
1063         $UMULL  r8,r6,r7
1064         $UMULH  r9,r6,r7
1065         addc    r10,r10,r8
1066         adde    r11,r11,r9
1067         addze   r12,r12
1068                 
1069                                         #mul_add_c(a[2],b[1],c1,c2,c3);
1070         $LD     r6,`2*$BNSZ`(r4)
1071         $LD     r7,`1*$BNSZ`(r5)
1072         $UMULL  r8,r6,r7
1073         $UMULH  r9,r6,r7
1074         addc    r10,r10,r8
1075         adde    r11,r11,r9
1076         addze   r12,r12
1077                                         #mul_add_c(a[3],b[0],c1,c2,c3);
1078         $LD     r6,`3*$BNSZ`(r4)
1079         $LD     r7,`0*$BNSZ`(r5)
1080         $UMULL  r8,r6,r7
1081         $UMULH  r9,r6,r7
1082         addc    r10,r10,r8
1083         adde    r11,r11,r9
1084         addze   r12,r12
1085         $ST     r10,`3*$BNSZ`(r3)       #r[3]=c1;
1086                                         #mul_add_c(a[4],b[0],c2,c3,c1);
1087         $LD     r6,`4*$BNSZ`(r4)
1088         $UMULL  r8,r6,r7
1089         $UMULH  r9,r6,r7
1090         addc    r11,r11,r8
1091         adde    r12,r12,r9
1092         addze   r10,r0
1093                                         #mul_add_c(a[3],b[1],c2,c3,c1);
1094         $LD     r6,`3*$BNSZ`(r4)
1095         $LD     r7,`1*$BNSZ`(r5)
1096         $UMULL  r8,r6,r7
1097         $UMULH  r9,r6,r7
1098         addc    r11,r11,r8
1099         adde    r12,r12,r9
1100         addze   r10,r10
1101                                         #mul_add_c(a[2],b[2],c2,c3,c1);
1102         $LD     r6,`2*$BNSZ`(r4)
1103         $LD     r7,`2*$BNSZ`(r5)
1104         $UMULL  r8,r6,r7
1105         $UMULH  r9,r6,r7
1106         addc    r11,r11,r8
1107         adde    r12,r12,r9
1108         addze   r10,r10
1109                                         #mul_add_c(a[1],b[3],c2,c3,c1);
1110         $LD     r6,`1*$BNSZ`(r4)
1111         $LD     r7,`3*$BNSZ`(r5)
1112         $UMULL  r8,r6,r7
1113         $UMULH  r9,r6,r7
1114         addc    r11,r11,r8
1115         adde    r12,r12,r9
1116         addze   r10,r10
1117                                         #mul_add_c(a[0],b[4],c2,c3,c1);
1118         $LD     r6,`0*$BNSZ`(r4)
1119         $LD     r7,`4*$BNSZ`(r5)
1120         $UMULL  r8,r6,r7
1121         $UMULH  r9,r6,r7
1122         addc    r11,r11,r8
1123         adde    r12,r12,r9
1124         addze   r10,r10
1125         $ST     r11,`4*$BNSZ`(r3)       #r[4]=c2;
1126                                         #mul_add_c(a[0],b[5],c3,c1,c2);
1127         $LD     r7,`5*$BNSZ`(r5)
1128         $UMULL  r8,r6,r7
1129         $UMULH  r9,r6,r7
1130         addc    r12,r12,r8
1131         adde    r10,r10,r9
1132         addze   r11,r0
1133                                         #mul_add_c(a[1],b[4],c3,c1,c2);
1134         $LD     r6,`1*$BNSZ`(r4)                
1135         $LD     r7,`4*$BNSZ`(r5)
1136         $UMULL  r8,r6,r7
1137         $UMULH  r9,r6,r7
1138         addc    r12,r12,r8
1139         adde    r10,r10,r9
1140         addze   r11,r11
1141                                         #mul_add_c(a[2],b[3],c3,c1,c2);
1142         $LD     r6,`2*$BNSZ`(r4)                
1143         $LD     r7,`3*$BNSZ`(r5)
1144         $UMULL  r8,r6,r7
1145         $UMULH  r9,r6,r7
1146         addc    r12,r12,r8
1147         adde    r10,r10,r9
1148         addze   r11,r11
1149                                         #mul_add_c(a[3],b[2],c3,c1,c2);
1150         $LD     r6,`3*$BNSZ`(r4)                
1151         $LD     r7,`2*$BNSZ`(r5)
1152         $UMULL  r8,r6,r7
1153         $UMULH  r9,r6,r7
1154         addc    r12,r12,r8
1155         adde    r10,r10,r9
1156         addze   r11,r11
1157                                         #mul_add_c(a[4],b[1],c3,c1,c2);
1158         $LD     r6,`4*$BNSZ`(r4)                
1159         $LD     r7,`1*$BNSZ`(r5)
1160         $UMULL  r8,r6,r7
1161         $UMULH  r9,r6,r7
1162         addc    r12,r12,r8
1163         adde    r10,r10,r9
1164         addze   r11,r11
1165                                         #mul_add_c(a[5],b[0],c3,c1,c2);
1166         $LD     r6,`5*$BNSZ`(r4)                
1167         $LD     r7,`0*$BNSZ`(r5)
1168         $UMULL  r8,r6,r7
1169         $UMULH  r9,r6,r7
1170         addc    r12,r12,r8
1171         adde    r10,r10,r9
1172         addze   r11,r11
1173         $ST     r12,`5*$BNSZ`(r3)       #r[5]=c3;
1174                                         #mul_add_c(a[6],b[0],c1,c2,c3);
1175         $LD     r6,`6*$BNSZ`(r4)
1176         $UMULL  r8,r6,r7
1177         $UMULH  r9,r6,r7
1178         addc    r10,r10,r8
1179         adde    r11,r11,r9
1180         addze   r12,r0
1181                                         #mul_add_c(a[5],b[1],c1,c2,c3);
1182         $LD     r6,`5*$BNSZ`(r4)
1183         $LD     r7,`1*$BNSZ`(r5)
1184         $UMULL  r8,r6,r7
1185         $UMULH  r9,r6,r7
1186         addc    r10,r10,r8
1187         adde    r11,r11,r9
1188         addze   r12,r12
1189                                         #mul_add_c(a[4],b[2],c1,c2,c3);
1190         $LD     r6,`4*$BNSZ`(r4)
1191         $LD     r7,`2*$BNSZ`(r5)
1192         $UMULL  r8,r6,r7
1193         $UMULH  r9,r6,r7
1194         addc    r10,r10,r8
1195         adde    r11,r11,r9
1196         addze   r12,r12
1197                                         #mul_add_c(a[3],b[3],c1,c2,c3);
1198         $LD     r6,`3*$BNSZ`(r4)
1199         $LD     r7,`3*$BNSZ`(r5)
1200         $UMULL  r8,r6,r7
1201         $UMULH  r9,r6,r7
1202         addc    r10,r10,r8
1203         adde    r11,r11,r9
1204         addze   r12,r12
1205                                         #mul_add_c(a[2],b[4],c1,c2,c3);
1206         $LD     r6,`2*$BNSZ`(r4)
1207         $LD     r7,`4*$BNSZ`(r5)
1208         $UMULL  r8,r6,r7
1209         $UMULH  r9,r6,r7
1210         addc    r10,r10,r8
1211         adde    r11,r11,r9
1212         addze   r12,r12
1213                                         #mul_add_c(a[1],b[5],c1,c2,c3);
1214         $LD     r6,`1*$BNSZ`(r4)
1215         $LD     r7,`5*$BNSZ`(r5)
1216         $UMULL  r8,r6,r7
1217         $UMULH  r9,r6,r7
1218         addc    r10,r10,r8
1219         adde    r11,r11,r9
1220         addze   r12,r12
1221                                         #mul_add_c(a[0],b[6],c1,c2,c3);
1222         $LD     r6,`0*$BNSZ`(r4)
1223         $LD     r7,`6*$BNSZ`(r5)
1224         $UMULL  r8,r6,r7
1225         $UMULH  r9,r6,r7
1226         addc    r10,r10,r8
1227         adde    r11,r11,r9
1228         addze   r12,r12
1229         $ST     r10,`6*$BNSZ`(r3)       #r[6]=c1;
1230                                         #mul_add_c(a[0],b[7],c2,c3,c1);
1231         $LD     r7,`7*$BNSZ`(r5)
1232         $UMULL  r8,r6,r7
1233         $UMULH  r9,r6,r7
1234         addc    r11,r11,r8
1235         adde    r12,r12,r9
1236         addze   r10,r0
1237                                         #mul_add_c(a[1],b[6],c2,c3,c1);
1238         $LD     r6,`1*$BNSZ`(r4)
1239         $LD     r7,`6*$BNSZ`(r5)
1240         $UMULL  r8,r6,r7
1241         $UMULH  r9,r6,r7
1242         addc    r11,r11,r8
1243         adde    r12,r12,r9
1244         addze   r10,r10
1245                                         #mul_add_c(a[2],b[5],c2,c3,c1);
1246         $LD     r6,`2*$BNSZ`(r4)
1247         $LD     r7,`5*$BNSZ`(r5)
1248         $UMULL  r8,r6,r7
1249         $UMULH  r9,r6,r7
1250         addc    r11,r11,r8
1251         adde    r12,r12,r9
1252         addze   r10,r10
1253                                         #mul_add_c(a[3],b[4],c2,c3,c1);
1254         $LD     r6,`3*$BNSZ`(r4)
1255         $LD     r7,`4*$BNSZ`(r5)
1256         $UMULL  r8,r6,r7
1257         $UMULH  r9,r6,r7
1258         addc    r11,r11,r8
1259         adde    r12,r12,r9
1260         addze   r10,r10
1261                                         #mul_add_c(a[4],b[3],c2,c3,c1);
1262         $LD     r6,`4*$BNSZ`(r4)
1263         $LD     r7,`3*$BNSZ`(r5)
1264         $UMULL  r8,r6,r7
1265         $UMULH  r9,r6,r7
1266         addc    r11,r11,r8
1267         adde    r12,r12,r9
1268         addze   r10,r10
1269                                         #mul_add_c(a[5],b[2],c2,c3,c1);
1270         $LD     r6,`5*$BNSZ`(r4)
1271         $LD     r7,`2*$BNSZ`(r5)
1272         $UMULL  r8,r6,r7
1273         $UMULH  r9,r6,r7
1274         addc    r11,r11,r8
1275         adde    r12,r12,r9
1276         addze   r10,r10
1277                                         #mul_add_c(a[6],b[1],c2,c3,c1);
1278         $LD     r6,`6*$BNSZ`(r4)
1279         $LD     r7,`1*$BNSZ`(r5)
1280         $UMULL  r8,r6,r7
1281         $UMULH  r9,r6,r7
1282         addc    r11,r11,r8
1283         adde    r12,r12,r9
1284         addze   r10,r10
1285                                         #mul_add_c(a[7],b[0],c2,c3,c1);
1286         $LD     r6,`7*$BNSZ`(r4)
1287         $LD     r7,`0*$BNSZ`(r5)
1288         $UMULL  r8,r6,r7
1289         $UMULH  r9,r6,r7
1290         addc    r11,r11,r8
1291         adde    r12,r12,r9
1292         addze   r10,r10
1293         $ST     r11,`7*$BNSZ`(r3)       #r[7]=c2;
1294                                         #mul_add_c(a[7],b[1],c3,c1,c2);
1295         $LD     r7,`1*$BNSZ`(r5)
1296         $UMULL  r8,r6,r7
1297         $UMULH  r9,r6,r7
1298         addc    r12,r12,r8
1299         adde    r10,r10,r9
1300         addze   r11,r0
1301                                         #mul_add_c(a[6],b[2],c3,c1,c2);
1302         $LD     r6,`6*$BNSZ`(r4)
1303         $LD     r7,`2*$BNSZ`(r5)
1304         $UMULL  r8,r6,r7
1305         $UMULH  r9,r6,r7
1306         addc    r12,r12,r8
1307         adde    r10,r10,r9
1308         addze   r11,r11
1309                                         #mul_add_c(a[5],b[3],c3,c1,c2);
1310         $LD     r6,`5*$BNSZ`(r4)
1311         $LD     r7,`3*$BNSZ`(r5)
1312         $UMULL  r8,r6,r7
1313         $UMULH  r9,r6,r7
1314         addc    r12,r12,r8
1315         adde    r10,r10,r9
1316         addze   r11,r11
1317                                         #mul_add_c(a[4],b[4],c3,c1,c2);
1318         $LD     r6,`4*$BNSZ`(r4)
1319         $LD     r7,`4*$BNSZ`(r5)
1320         $UMULL  r8,r6,r7
1321         $UMULH  r9,r6,r7
1322         addc    r12,r12,r8
1323         adde    r10,r10,r9
1324         addze   r11,r11
1325                                         #mul_add_c(a[3],b[5],c3,c1,c2);
1326         $LD     r6,`3*$BNSZ`(r4)
1327         $LD     r7,`5*$BNSZ`(r5)
1328         $UMULL  r8,r6,r7
1329         $UMULH  r9,r6,r7
1330         addc    r12,r12,r8
1331         adde    r10,r10,r9
1332         addze   r11,r11
1333                                         #mul_add_c(a[2],b[6],c3,c1,c2);
1334         $LD     r6,`2*$BNSZ`(r4)
1335         $LD     r7,`6*$BNSZ`(r5)
1336         $UMULL  r8,r6,r7
1337         $UMULH  r9,r6,r7
1338         addc    r12,r12,r8
1339         adde    r10,r10,r9
1340         addze   r11,r11
1341                                         #mul_add_c(a[1],b[7],c3,c1,c2);
1342         $LD     r6,`1*$BNSZ`(r4)
1343         $LD     r7,`7*$BNSZ`(r5)
1344         $UMULL  r8,r6,r7
1345         $UMULH  r9,r6,r7
1346         addc    r12,r12,r8
1347         adde    r10,r10,r9
1348         addze   r11,r11
1349         $ST     r12,`8*$BNSZ`(r3)       #r[8]=c3;
1350                                         #mul_add_c(a[2],b[7],c1,c2,c3);
1351         $LD     r6,`2*$BNSZ`(r4)
1352         $UMULL  r8,r6,r7
1353         $UMULH  r9,r6,r7
1354         addc    r10,r10,r8
1355         adde    r11,r11,r9
1356         addze   r12,r0
1357                                         #mul_add_c(a[3],b[6],c1,c2,c3);
1358         $LD     r6,`3*$BNSZ`(r4)
1359         $LD     r7,`6*$BNSZ`(r5)
1360         $UMULL  r8,r6,r7
1361         $UMULH  r9,r6,r7
1362         addc    r10,r10,r8
1363         adde    r11,r11,r9
1364         addze   r12,r12
1365                                         #mul_add_c(a[4],b[5],c1,c2,c3);
1366         $LD     r6,`4*$BNSZ`(r4)
1367         $LD     r7,`5*$BNSZ`(r5)
1368         $UMULL  r8,r6,r7
1369         $UMULH  r9,r6,r7
1370         addc    r10,r10,r8
1371         adde    r11,r11,r9
1372         addze   r12,r12
1373                                         #mul_add_c(a[5],b[4],c1,c2,c3);
1374         $LD     r6,`5*$BNSZ`(r4)
1375         $LD     r7,`4*$BNSZ`(r5)
1376         $UMULL  r8,r6,r7
1377         $UMULH  r9,r6,r7
1378         addc    r10,r10,r8
1379         adde    r11,r11,r9
1380         addze   r12,r12
1381                                         #mul_add_c(a[6],b[3],c1,c2,c3);
1382         $LD     r6,`6*$BNSZ`(r4)
1383         $LD     r7,`3*$BNSZ`(r5)
1384         $UMULL  r8,r6,r7
1385         $UMULH  r9,r6,r7
1386         addc    r10,r10,r8
1387         adde    r11,r11,r9
1388         addze   r12,r12
1389                                         #mul_add_c(a[7],b[2],c1,c2,c3);
1390         $LD     r6,`7*$BNSZ`(r4)
1391         $LD     r7,`2*$BNSZ`(r5)
1392         $UMULL  r8,r6,r7
1393         $UMULH  r9,r6,r7
1394         addc    r10,r10,r8
1395         adde    r11,r11,r9
1396         addze   r12,r12
1397         $ST     r10,`9*$BNSZ`(r3)       #r[9]=c1;
1398                                         #mul_add_c(a[7],b[3],c2,c3,c1);
1399         $LD     r7,`3*$BNSZ`(r5)
1400         $UMULL  r8,r6,r7
1401         $UMULH  r9,r6,r7
1402         addc    r11,r11,r8
1403         adde    r12,r12,r9
1404         addze   r10,r0
1405                                         #mul_add_c(a[6],b[4],c2,c3,c1);
1406         $LD     r6,`6*$BNSZ`(r4)
1407         $LD     r7,`4*$BNSZ`(r5)
1408         $UMULL  r8,r6,r7
1409         $UMULH  r9,r6,r7
1410         addc    r11,r11,r8
1411         adde    r12,r12,r9
1412         addze   r10,r10
1413                                         #mul_add_c(a[5],b[5],c2,c3,c1);
1414         $LD     r6,`5*$BNSZ`(r4)
1415         $LD     r7,`5*$BNSZ`(r5)
1416         $UMULL  r8,r6,r7
1417         $UMULH  r9,r6,r7
1418         addc    r11,r11,r8
1419         adde    r12,r12,r9
1420         addze   r10,r10
1421                                         #mul_add_c(a[4],b[6],c2,c3,c1);
1422         $LD     r6,`4*$BNSZ`(r4)
1423         $LD     r7,`6*$BNSZ`(r5)
1424         $UMULL  r8,r6,r7
1425         $UMULH  r9,r6,r7
1426         addc    r11,r11,r8
1427         adde    r12,r12,r9
1428         addze   r10,r10
1429                                         #mul_add_c(a[3],b[7],c2,c3,c1);
1430         $LD     r6,`3*$BNSZ`(r4)
1431         $LD     r7,`7*$BNSZ`(r5)
1432         $UMULL  r8,r6,r7
1433         $UMULH  r9,r6,r7
1434         addc    r11,r11,r8
1435         adde    r12,r12,r9
1436         addze   r10,r10
1437         $ST     r11,`10*$BNSZ`(r3)      #r[10]=c2;
1438                                         #mul_add_c(a[4],b[7],c3,c1,c2);
1439         $LD     r6,`4*$BNSZ`(r4)
1440         $UMULL  r8,r6,r7
1441         $UMULH  r9,r6,r7
1442         addc    r12,r12,r8
1443         adde    r10,r10,r9
1444         addze   r11,r0
1445                                         #mul_add_c(a[5],b[6],c3,c1,c2);
1446         $LD     r6,`5*$BNSZ`(r4)
1447         $LD     r7,`6*$BNSZ`(r5)
1448         $UMULL  r8,r6,r7
1449         $UMULH  r9,r6,r7
1450         addc    r12,r12,r8
1451         adde    r10,r10,r9
1452         addze   r11,r11
1453                                         #mul_add_c(a[6],b[5],c3,c1,c2);
1454         $LD     r6,`6*$BNSZ`(r4)
1455         $LD     r7,`5*$BNSZ`(r5)
1456         $UMULL  r8,r6,r7
1457         $UMULH  r9,r6,r7
1458         addc    r12,r12,r8
1459         adde    r10,r10,r9
1460         addze   r11,r11
1461                                         #mul_add_c(a[7],b[4],c3,c1,c2);
1462         $LD     r6,`7*$BNSZ`(r4)
1463         $LD     r7,`4*$BNSZ`(r5)
1464         $UMULL  r8,r6,r7
1465         $UMULH  r9,r6,r7
1466         addc    r12,r12,r8
1467         adde    r10,r10,r9
1468         addze   r11,r11
1469         $ST     r12,`11*$BNSZ`(r3)      #r[11]=c3;
1470                                         #mul_add_c(a[7],b[5],c1,c2,c3);
1471         $LD     r7,`5*$BNSZ`(r5)
1472         $UMULL  r8,r6,r7
1473         $UMULH  r9,r6,r7
1474         addc    r10,r10,r8
1475         adde    r11,r11,r9
1476         addze   r12,r0
1477                                         #mul_add_c(a[6],b[6],c1,c2,c3);
1478         $LD     r6,`6*$BNSZ`(r4)
1479         $LD     r7,`6*$BNSZ`(r5)
1480         $UMULL  r8,r6,r7
1481         $UMULH  r9,r6,r7
1482         addc    r10,r10,r8
1483         adde    r11,r11,r9
1484         addze   r12,r12
1485                                         #mul_add_c(a[5],b[7],c1,c2,c3);
1486         $LD     r6,`5*$BNSZ`(r4)
1487         $LD     r7,`7*$BNSZ`(r5)
1488         $UMULL  r8,r6,r7
1489         $UMULH  r9,r6,r7
1490         addc    r10,r10,r8
1491         adde    r11,r11,r9
1492         addze   r12,r12
1493         $ST     r10,`12*$BNSZ`(r3)      #r[12]=c1;
1494                                         #mul_add_c(a[6],b[7],c2,c3,c1);
1495         $LD     r6,`6*$BNSZ`(r4)
1496         $UMULL  r8,r6,r7
1497         $UMULH  r9,r6,r7
1498         addc    r11,r11,r8
1499         adde    r12,r12,r9
1500         addze   r10,r0
1501                                         #mul_add_c(a[7],b[6],c2,c3,c1);
1502         $LD     r6,`7*$BNSZ`(r4)
1503         $LD     r7,`6*$BNSZ`(r5)
1504         $UMULL  r8,r6,r7
1505         $UMULH  r9,r6,r7
1506         addc    r11,r11,r8
1507         adde    r12,r12,r9
1508         addze   r10,r10
1509         $ST     r11,`13*$BNSZ`(r3)      #r[13]=c2;
1510                                         #mul_add_c(a[7],b[7],c3,c1,c2);
1511         $LD     r7,`7*$BNSZ`(r5)
1512         $UMULL  r8,r6,r7
1513         $UMULH  r9,r6,r7
1514         addc    r12,r12,r8
1515         adde    r10,r10,r9
1516         $ST     r12,`14*$BNSZ`(r3)      #r[14]=c3;
1517         $ST     r10,`15*$BNSZ`(r3)      #r[15]=c1;
1518         blr
1519         .long   0
1520         .byte   0,12,0x14,0,0,0,3,0
1521         .long   0
1522 .size   .bn_mul_comba8,.-.bn_mul_comba8
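#
#	For reference: each "#mul_add_c(a,b,c0,c1,c2)" annotation above
#	denotes one word multiply-accumulate into a three-word carry
#	chain. A C sketch of the semantics (BN_ULLONG standing in for a
#	hypothetical type twice as wide as BN_ULONG; each "+ carry"
#	takes the carry out of the previous addition):
#
#		t   = (BN_ULLONG)a * b;
#		c0 += (BN_ULONG)t;				/* addc  */
#		c1 += (BN_ULONG)(t >> BN_BITS2) + carry;	/* adde  */
#		c2 += carry;					/* addze */
#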
1523
1524 #
1525 #       NOTE:   The following label name should be changed to
1526 #               "bn_sub_words" i.e. remove the first dot
1527 #               for the gcc compiler. This should be automatically
1528 #               done in the build
1529 #
1530 #
1531 .align  4
1532 .bn_sub_words:
1533 #
1534 #       Handcoded version of bn_sub_words
1535 #
1536 #BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1537 #
1538 #       r3 = r
1539 #       r4 = a
1540 #       r5 = b
1541 #       r6 = n
1542 #
1543 #       Note:   No loop unrolling done since this is not a performance
1544 #               critical loop.
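#
#	For reference, a C sketch of the semantics (illustrative only,
#	not the actual OpenSSL C source):
#
#		BN_ULONG borrow = 0;
#		for (int i = 0; i < n; i++) {
#			BN_ULONG t = a[i] - b[i] - borrow;
#			borrow = (a[i] < b[i]) || (borrow && a[i] == b[i]);
#			r[i] = t;
#		}
#		return borrow;		/* 1 if a < b, else 0 */
#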
1545
1546         xor     r0,r0,r0        #set r0 = 0
1547 #
1548 #       check for r6 = 0 AND set carry bit.
1549 #
1550         subfc.  r7,r0,r6        # If r6 is 0 then result is 0.
1551                                 # if r6 > 0 then result !=0
1552                                 # In either case carry bit is set.
1553         beq     Lppcasm_sub_adios
1554         addi    r4,r4,-$BNSZ
1555         addi    r3,r3,-$BNSZ
1556         addi    r5,r5,-$BNSZ
1557         mtctr   r6
1558 Lppcasm_sub_mainloop:   
1559         $LDU    r7,$BNSZ(r4)
1560         $LDU    r8,$BNSZ(r5)
1561         subfe   r6,r8,r7        # r6 = r7+carry bit + onescomplement(r8)
1562                                 # if carry = 1 this is r7-r8. Else it
1563                                 # is r7-r8 -1 as we need.
1564         $STU    r6,$BNSZ(r3)
1565         bdnz    Lppcasm_sub_mainloop
1566 Lppcasm_sub_adios:      
1567         subfze  r3,r0           # if carry bit is set then r3 = 0 else -1
1568         andi.   r3,r3,1         # keep only last bit.
1569         blr
1570         .long   0
1571         .byte   0,12,0x14,0,0,0,4,0
1572         .long   0
1573 .size   .bn_sub_words,.-.bn_sub_words
1574
1575 #
1576 #       NOTE:   The following label name should be changed to
1577 #               "bn_add_words" i.e. remove the first dot
1578 #               for the gcc compiler. This should be automatically
1579 #               done in the build
1580 #
1581
1582 .align  4
1583 .bn_add_words:
1584 #
1585 #       Handcoded version of bn_add_words
1586 #
1587 #BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1588 #
1589 #       r3 = r
1590 #       r4 = a
1591 #       r5 = b
1592 #       r6 = n
1593 #
1594 #       Note:   No loop unrolling done since this is not a performance
1595 #               critical loop.
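#
#	For reference, a C sketch of the semantics (illustrative only,
#	not the actual OpenSSL C source):
#
#		BN_ULONG carry = 0;
#		for (int i = 0; i < n; i++) {
#			BN_ULONG t = a[i] + b[i] + carry;
#			carry = (t < a[i]) || (carry && t == a[i]);
#			r[i] = t;
#		}
#		return carry;
#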
1596
1597         xor     r0,r0,r0
1598 #
1599 #       check for r6 = 0 (needed; the same instruction also clears the carry bit).
1600 #
1601         addic.  r6,r6,0         #test r6 and clear carry bit.
1602         beq     Lppcasm_add_adios
1603         addi    r4,r4,-$BNSZ
1604         addi    r3,r3,-$BNSZ
1605         addi    r5,r5,-$BNSZ
1606         mtctr   r6
1607 Lppcasm_add_mainloop:   
1608         $LDU    r7,$BNSZ(r4)
1609         $LDU    r8,$BNSZ(r5)
1610         adde    r8,r7,r8
1611         $STU    r8,$BNSZ(r3)
1612         bdnz    Lppcasm_add_mainloop
1613 Lppcasm_add_adios:      
1614         addze   r3,r0                   #return carry bit.
1615         blr
1616         .long   0
1617         .byte   0,12,0x14,0,0,0,4,0
1618         .long   0
1619 .size   .bn_add_words,.-.bn_add_words
1620
1621 #
1622 #       NOTE:   The following label name should be changed to
1623 #               "bn_div_words" i.e. remove the first dot
1624 #               for the gcc compiler. This should be automatically
1625 #               done in the build
1626 #
1627
1628 .align  4
1629 .bn_div_words:
1630 #
1631 #       This is a cleaned up version of code generated by
1632 #       the AIX compiler. The only optimization is to use
1633 #       the PPC instruction to count leading zeros instead
1634 #       of a call to num_bits_word. Since this was compiled
1635 #       at only -O2, it could probably be squeezed further.
1636 #       
1637 #       r3 = h
1638 #       r4 = l
1639 #       r5 = d
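#
#	For reference: this computes the one-word quotient
#	((h << BN_BITS2) | l) / d; normal usage has h < d so the
#	quotient fits in a single word, and d == 0 returns -1 (below).
#	With a hypothetical double-width type BN_ULLONG the routine
#	would collapse to:
#
#		return (BN_ULONG)((((BN_ULLONG)h << BN_BITS2) | l) / d);
#
#	No such type is available here, so the quotient is formed one
#	BN_BITS4-wide half at a time.
#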
1640         
1641         $UCMPI  0,r5,0                  # compare r5 and 0
1642         bne     Lppcasm_div1            # proceed if d!=0
1643         li      r3,-1                   # d=0 return -1
1644         blr
1645 Lppcasm_div1:
1646         xor     r0,r0,r0                #r0=0
1647         li      r8,$BITS
1648         $CNTLZ. r7,r5                   #r7 = num leading 0s in d.
1649         beq     Lppcasm_div2            #proceed if no leading zeros
1650         subf    r8,r7,r8                #r8 = BN_num_bits_word(d)
1651         $SHR.   r9,r3,r8                #are there any bits above the r8-th?
1652         $TR     16,r9,r0                #if there are, trap to dump core...
1653 Lppcasm_div2:
1654         $UCMP   0,r3,r5                 #h>=d?
1655         blt     Lppcasm_div3            #goto Lppcasm_div3 if not
1656         subf    r3,r5,r3                #h-=d ; 
1657 Lppcasm_div3:                           #r7 = BN_BITS2 - num_bits(d), i.e. r7 = i
1658         cmpi    0,0,r7,0                # is (i == 0)?
1659         beq     Lppcasm_div4
1660         $SHL    r3,r3,r7                # h = (h<< i)
1661         $SHR    r8,r4,r8                # r8 = l >> (BN_BITS2-i)
1662         $SHL    r5,r5,r7                # d<<=i
1663         or      r3,r3,r8                # h = (h<<i)|(l>>(BN_BITS2-i))
1664         $SHL    r4,r4,r7                # l <<=i
1665 Lppcasm_div4:
1666         $SHRI   r9,r5,`$BITS/2`         # r9 = dh
1667                                         # dl will be computed when needed
1668                                         # as it saves registers.
1669         li      r6,2                    #r6=2
1670         mtctr   r6                      #counter will be in the count register.
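#
#	The outer loop runs exactly twice: each pass produces one
#	BN_BITS4-wide half of the quotient, high half first. Each pass
#	estimates q = h/dh and then lets the inner loop below walk q
#	down while q*d would overshoot the remainder so far (the
#	classical schoolbook-division correction step).
#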
1671 Lppcasm_divouterloop: 
1672         $SHRI   r8,r3,`$BITS/2`         #r8 = (h>>BN_BITS4)
1673         $SHRI   r11,r4,`$BITS/2`        #r11= (l&BN_MASK2h)>>BN_BITS4
1674                                         # compute here for innerloop.
1675         $UCMP   0,r8,r9                 # is (h>>BN_BITS4)==dh
1676         bne     Lppcasm_div5            # goto Lppcasm_div5 if not
1677
1678         li      r8,-1
1679         $CLRU   r8,r8,`$BITS/2`         #q = BN_MASK2l 
1680         b       Lppcasm_div6
1681 Lppcasm_div5:
1682         $UDIV   r8,r3,r9                #q = h/dh
1683 Lppcasm_div6:
1684         $UMULL  r12,r9,r8               #th = q*dh
1685         $CLRU   r10,r5,`$BITS/2`        #r10=dl
1686         $UMULL  r6,r8,r10               #tl = q*dl
1687         
1688 Lppcasm_divinnerloop:
1689         subf    r10,r12,r3              #t = h -th
1690         $SHRI   r7,r10,`$BITS/2`        #r7 = t>>BN_BITS4, i.e. the high half of t
1691         addic.  r7,r7,0                 #test if r7 == 0. used below.
1692                                         # now want to compute
1693                                         # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
1694                                         # the following 2 instructions do that
1695         $SHLI   r7,r10,`$BITS/2`        # r7 = (t<<BN_BITS4)
1696         or      r7,r7,r11               # r7|=((l&BN_MASK2h)>>BN_BITS4)
1697         $UCMP   cr1,r6,r7               # is tl <= r7 ? (result in cr1)
1698         bne     Lppcasm_divinnerexit
1699         ble     cr1,Lppcasm_divinnerexit
1700         addi    r8,r8,-1                #q--
1701         subf    r12,r9,r12              #th -=dh
1702         $CLRU   r10,r5,`$BITS/2`        #r10=dl. t is no longer needed in loop.
1703         subf    r6,r10,r6               #tl -=dl
1704         b       Lppcasm_divinnerloop
1705 Lppcasm_divinnerexit:
1706         $SHRI   r10,r6,`$BITS/2`        #t=(tl>>BN_BITS4)
1707         $SHLI   r11,r6,`$BITS/2`        #tl=(tl<<BN_BITS4)&BN_MASK2h;
1708         $UCMP   cr1,r4,r11              # compare l and tl
1709         add     r12,r12,r10             # th+=t
1710         bge     cr1,Lppcasm_div7        # if (l>=tl) goto Lppcasm_div7
1711         addi    r12,r12,1               # th++
1712 Lppcasm_div7:
1713         subf    r11,r11,r4              #r11=l-tl
1714         $UCMP   cr1,r3,r12              #compare h and th
1715         bge     cr1,Lppcasm_div8        #if (h>=th) goto Lppcasm_div8
1716         addi    r8,r8,-1                # q--
1717         add     r3,r5,r3                # h+=d
1718 Lppcasm_div8:
1719         subf    r12,r12,r3              #r12 = h-th
1720         $SHLI   r4,r11,`$BITS/2`        #l=(l&BN_MASK2l)<<BN_BITS4
1721                                         # want to compute
1722                                         # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
1723                                         # the following 2 instructions will do this.
1724         $INSR   r11,r12,`$BITS/2`,`$BITS/2`     # r11 is the value we want rotated $BITS/2.
1725         $ROTL   r3,r11,`$BITS/2`        # rotate by $BITS/2 and store in r3
1726         bdz     Lppcasm_div9            #if (count==0) break ;
1727         $SHLI   r0,r8,`$BITS/2`         #ret =q<<BN_BITS4
1728         b       Lppcasm_divouterloop
1729 Lppcasm_div9:
1730         or      r3,r8,r0
1731         blr
1732         .long   0
1733         .byte   0,12,0x14,0,0,0,3,0
1734         .long   0
1735 .size   .bn_div_words,.-.bn_div_words
1736
1737 #
1738 #       NOTE:   The following label name should be changed to
1739 #               "bn_sqr_words" i.e. remove the first dot
1740 #               for the gcc compiler. This should be automatically
1741 #               done in the build
1742 #
1743 .align  4
1744 .bn_sqr_words:
1745 #
1746 #       Optimized version of bn_sqr_words
1747 #
1748 #       void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
1749 #
1750 #       r3 = r
1751 #       r4 = a
1752 #       r5 = n
1753 #
1754 #       r6 = a[i].
1755 #       r7,r8 = product.
1756 #
1757 #       No unrolling done here. Not performance critical.
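#
#	For reference, a C sketch of the semantics, where lo()/hi()
#	informally denote the low and high words of the double-width
#	product (notation only, not real macros):
#
#		for (int i = 0; i < n; i++) {
#			r[2*i]   = lo(a[i] * a[i]);
#			r[2*i+1] = hi(a[i] * a[i]);
#		}
#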
1758
1759         addic.  r5,r5,0                 #test r5.
1760         beq     Lppcasm_sqr_adios
1761         addi    r4,r4,-$BNSZ
1762         addi    r3,r3,-$BNSZ
1763         mtctr   r5
1764 Lppcasm_sqr_mainloop:   
1765                                         #sqr(r[0],r[1],a[0]);
1766         $LDU    r6,$BNSZ(r4)
1767         $UMULL  r7,r6,r6
1768         $UMULH  r8,r6,r6
1769         $STU    r7,$BNSZ(r3)
1770         $STU    r8,$BNSZ(r3)
1771         bdnz    Lppcasm_sqr_mainloop
1772 Lppcasm_sqr_adios:      
1773         blr
1774         .long   0
1775         .byte   0,12,0x14,0,0,0,3,0
1776         .long   0
1777 .size   .bn_sqr_words,.-.bn_sqr_words
1778
1779 #
1780 #       NOTE:   The following label name should be changed to
1781 #               "bn_mul_words" i.e. remove the first dot
1782 #               for the gcc compiler. This should be automatically
1783 #               done in the build
1784 #
1785
1786 .align  4       
1787 .bn_mul_words:
1788 #
1789 # BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1790 #
1791 # r3 = rp
1792 # r4 = ap
1793 # r5 = num
1794 # r6 = w
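#
# For reference, a C sketch of the semantics, where lo()/hi()
# informally denote the low and high words of the double-width
# product (notation only, not real macros):
#
#	BN_ULONG c = 0;
#	for (int i = 0; i < num; i++) {
#		rp[i] = lo(ap[i] * w) + c;
#		c     = hi(ap[i] * w) + (rp[i] < c);	/* propagate carry */
#	}
#	return c;
#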
1795         xor     r0,r0,r0
1796         xor     r12,r12,r12             # used for carry
1797         rlwinm. r7,r5,30,2,31           # num >> 2
1798         beq     Lppcasm_mw_REM
1799         mtctr   r7
1800 Lppcasm_mw_LOOP:        
1801                                         #mul(rp[0],ap[0],w,c1);
1802         $LD     r8,`0*$BNSZ`(r4)
1803         $UMULL  r9,r6,r8
1804         $UMULH  r10,r6,r8
1805         addc    r9,r9,r12
1806         #addze  r10,r10                 #carry is NOT ignored.
1807                                         #will be taken care of
1808                                         #in second spin below
1809                                         #using adde.
1810         $ST     r9,`0*$BNSZ`(r3)
1811                                         #mul(rp[1],ap[1],w,c1);
1812         $LD     r8,`1*$BNSZ`(r4)        
1813         $UMULL  r11,r6,r8
1814         $UMULH  r12,r6,r8
1815         adde    r11,r11,r10
1816         #addze  r12,r12
1817         $ST     r11,`1*$BNSZ`(r3)
1818                                         #mul(rp[2],ap[2],w,c1);
1819         $LD     r8,`2*$BNSZ`(r4)
1820         $UMULL  r9,r6,r8
1821         $UMULH  r10,r6,r8
1822         adde    r9,r9,r12
1823         #addze  r10,r10
1824         $ST     r9,`2*$BNSZ`(r3)
1825                                         #mul(rp[3],ap[3],w,c1);
1826         $LD     r8,`3*$BNSZ`(r4)
1827         $UMULL  r11,r6,r8
1828         $UMULH  r12,r6,r8
1829         adde    r11,r11,r10
1830         addze   r12,r12                 #this spin we collect carry into
1831                                         #r12
1832         $ST     r11,`3*$BNSZ`(r3)
1833         
1834         addi    r3,r3,`4*$BNSZ`
1835         addi    r4,r4,`4*$BNSZ`
1836         bdnz    Lppcasm_mw_LOOP
1837
1838 Lppcasm_mw_REM:
1839         andi.   r5,r5,0x3
1840         beq     Lppcasm_mw_OVER
1841                                         #mul(rp[0],ap[0],w,c1);
1842         $LD     r8,`0*$BNSZ`(r4)
1843         $UMULL  r9,r6,r8
1844         $UMULH  r10,r6,r8
1845         addc    r9,r9,r12
1846         addze   r10,r10
1847         $ST     r9,`0*$BNSZ`(r3)
1848         addi    r12,r10,0
1849         
1850         addi    r5,r5,-1
1851         cmpli   0,0,r5,0
1852         beq     Lppcasm_mw_OVER
1853
1854         
1855                                         #mul(rp[1],ap[1],w,c1);
1856         $LD     r8,`1*$BNSZ`(r4)        
1857         $UMULL  r9,r6,r8
1858         $UMULH  r10,r6,r8
1859         addc    r9,r9,r12
1860         addze   r10,r10
1861         $ST     r9,`1*$BNSZ`(r3)
1862         addi    r12,r10,0
1863         
1864         addi    r5,r5,-1
1865         cmpli   0,0,r5,0
1866         beq     Lppcasm_mw_OVER
1867         
1868                                         #mul(rp[2],ap[2],w,c1);
1869         $LD     r8,`2*$BNSZ`(r4)
1870         $UMULL  r9,r6,r8
1871         $UMULH  r10,r6,r8
1872         addc    r9,r9,r12
1873         addze   r10,r10
1874         $ST     r9,`2*$BNSZ`(r3)
1875         addi    r12,r10,0
1876                 
1877 Lppcasm_mw_OVER:        
1878         addi    r3,r12,0
1879         blr
1880         .long   0
1881         .byte   0,12,0x14,0,0,0,4,0
1882         .long   0
1883 .size   .bn_mul_words,.-.bn_mul_words
1884
1885 #
1886 #       NOTE:   The following label name should be changed to
1887 #               "bn_mul_add_words" i.e. remove the first dot
1888 #               for the gcc compiler. This should be automatically
1889 #               done in the build
1890 #
1891
1892 .align  4
1893 .bn_mul_add_words:
1894 #
1895 # BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1896 #
1897 # r3 = rp
1898 # r4 = ap
1899 # r5 = num
1900 # r6 = w
1901 #
1902 # empirical evidence suggests that the unrolled version performs best!
1903 #
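# For reference, a C sketch of the semantics, with lo()/hi() as
# informal notation for the halves of the double-width product:
#
#	BN_ULONG c = 0;
#	for (int i = 0; i < num; i++) {
#		BN_ULONG h = hi(ap[i] * w);
#		BN_ULONG t = lo(ap[i] * w) + c;
#		h += (t < c);			/* carry from adding c     */
#		t += rp[i];
#		h += (t < rp[i]);		/* carry from adding rp[i] */
#		rp[i] = t;
#		c = h;
#	}
#	return c;
#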
1904         xor     r0,r0,r0                #r0 = 0
1905         xor     r12,r12,r12             #r12 = 0 . used for carry               
1906         rlwinm. r7,r5,30,2,31           # num >> 2
1907         beq     Lppcasm_maw_leftover    # if (num < 4) goto Lppcasm_maw_leftover
1908         mtctr   r7
1909 Lppcasm_maw_mainloop:   
1910                                         #mul_add(rp[0],ap[0],w,c1);
1911         $LD     r8,`0*$BNSZ`(r4)
1912         $LD     r11,`0*$BNSZ`(r3)
1913         $UMULL  r9,r6,r8
1914         $UMULH  r10,r6,r8
1915         addc    r9,r9,r12               #r12 is carry.
1916         addze   r10,r10
1917         addc    r9,r9,r11
1918         #addze  r10,r10
1919                                         #the above instruction addze
1920                                         #is NOT needed. Carry will NOT
1921                                         #be ignored. It's not affected
1922                                         #by multiply and will be collected
1923                                         #in the next spin
1924         $ST     r9,`0*$BNSZ`(r3)
1925         
1926                                         #mul_add(rp[1],ap[1],w,c1);
1927         $LD     r8,`1*$BNSZ`(r4)        
1928         $LD     r9,`1*$BNSZ`(r3)
1929         $UMULL  r11,r6,r8
1930         $UMULH  r12,r6,r8
1931         adde    r11,r11,r10             #r10 is carry.
1932         addze   r12,r12
1933         addc    r11,r11,r9
1934         #addze  r12,r12
1935         $ST     r11,`1*$BNSZ`(r3)
1936         
1937                                         #mul_add(rp[2],ap[2],w,c1);
1938         $LD     r8,`2*$BNSZ`(r4)
1939         $UMULL  r9,r6,r8
1940         $LD     r11,`2*$BNSZ`(r3)
1941         $UMULH  r10,r6,r8
1942         adde    r9,r9,r12
1943         addze   r10,r10
1944         addc    r9,r9,r11
1945         #addze  r10,r10
1946         $ST     r9,`2*$BNSZ`(r3)
1947         
1948                                         #mul_add(rp[3],ap[3],w,c1);
1949         $LD     r8,`3*$BNSZ`(r4)
1950         $UMULL  r11,r6,r8
1951         $LD     r9,`3*$BNSZ`(r3)
1952         $UMULH  r12,r6,r8
1953         adde    r11,r11,r10
1954         addze   r12,r12
1955         addc    r11,r11,r9
1956         addze   r12,r12
1957         $ST     r11,`3*$BNSZ`(r3)
1958         addi    r3,r3,`4*$BNSZ`
1959         addi    r4,r4,`4*$BNSZ`
1960         bdnz    Lppcasm_maw_mainloop
1961         
1962 Lppcasm_maw_leftover:
1963         andi.   r5,r5,0x3
1964         beq     Lppcasm_maw_adios
1965         addi    r3,r3,-$BNSZ
1966         addi    r4,r4,-$BNSZ
1967                                         #mul_add(rp[0],ap[0],w,c1);
1968         mtctr   r5
1969         $LDU    r8,$BNSZ(r4)
1970         $UMULL  r9,r6,r8
1971         $UMULH  r10,r6,r8
1972         $LDU    r11,$BNSZ(r3)
1973         addc    r9,r9,r11
1974         addze   r10,r10
1975         addc    r9,r9,r12
1976         addze   r12,r10
1977         $ST     r9,0(r3)
1978         
1979         bdz     Lppcasm_maw_adios
1980                                         #mul_add(rp[1],ap[1],w,c1);
1981         $LDU    r8,$BNSZ(r4)    
1982         $UMULL  r9,r6,r8
1983         $UMULH  r10,r6,r8
1984         $LDU    r11,$BNSZ(r3)
1985         addc    r9,r9,r11
1986         addze   r10,r10
1987         addc    r9,r9,r12
1988         addze   r12,r10
1989         $ST     r9,0(r3)
1990         
1991         bdz     Lppcasm_maw_adios
1992                                         #mul_add(rp[2],ap[2],w,c1);
1993         $LDU    r8,$BNSZ(r4)
1994         $UMULL  r9,r6,r8
1995         $UMULH  r10,r6,r8
1996         $LDU    r11,$BNSZ(r3)
1997         addc    r9,r9,r11
1998         addze   r10,r10
1999         addc    r9,r9,r12
2000         addze   r12,r10
2001         $ST     r9,0(r3)
2002                 
2003 Lppcasm_maw_adios:      
2004         addi    r3,r12,0
2005         blr
2006         .long   0
2007         .byte   0,12,0x14,0,0,0,4,0
2008         .long   0
2009 .size   .bn_mul_add_words,.-.bn_mul_add_words
2010         .align  4
2011 EOF
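# Evaluate every `...`-quoted Perl expression embedded in the template
# above (e.g. `4*$BNSZ`) and splice the result into the emitted code.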
2012 $data =~ s/\`([^\`]*)\`/eval $1/gem;
2013 print $data;
2014 close STDOUT or die "error closing STDOUT: $!";