crypto/bn/asm/ppc.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2004-2018 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 # Implemented as a Perl wrapper as we want to support several different
  10 # architectures with single file. We pick up the target based on the
  11 # file name we are asked to generate.
  12 #
  13 # It should be noted though that this perl code is nothing like
  14 # <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
  15 # as pre-processor to cover for platform differences in name decoration,
  16 # linker tables, 32-/64-bit instruction sets...
  17 #
  18 # As you might know there are several PowerPC ABIs in use. Most notably
  19 # Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
  20 # are similar enough to implement leaf(!) functions, which would be ABI
  21 # neutral. And that's what you find here: ABI neutral leaf functions.
  22 # In case you wonder what that is...
  23 #
  24 #       AIX performance
  25 #
  26 #       MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
  27 #
  28 #       The following is the performance of 32-bit compiler
  29 #       generated code:
  30 #
  31 #       OpenSSL 0.9.6c 21 dec 2001
  32 #       built on: Tue Jun 11 11:06:51 EDT 2002
  33 #       options:bn(64,32) ...
  34 #compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
  35 #                  sign    verify    sign/s verify/s
  36 #rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
  37 #rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
  38 #rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
  39 #rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
  40 #dsa  512 bits   0.0087s   0.0106s    114.3     94.5
  41 #dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
  42 #
  43 #       Same benchmark with this assembler code:
  44 #
  45 #rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
  46 #rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
  47 #rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
  48 #rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
  49 #dsa  512 bits   0.0052s   0.0062s    191.6    162.0
  50 #dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
  51 #
  52 #       Number of operations increases by almost 75%
  53 #
  54 #       Here are performance numbers for 64-bit compiler
  55 #       generated code:
  56 #
  57 #       OpenSSL 0.9.6g [engine] 9 Aug 2002
  58 #       built on: Fri Apr 18 16:59:20 EDT 2003
  59 #       options:bn(64,64) ...
  60 #       compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
  61 #                  sign    verify    sign/s verify/s
  62 #rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
  63 #rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
  64 #rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
  65 #rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
  66 #dsa  512 bits   0.0026s   0.0032s    382.5    313.7
  67 #dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
  68 #
  69 #       Same benchmark with this assembler code:
  70 #
  71 #rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
  72 #rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
  73 #rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
  74 #rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
  75 #dsa  512 bits   0.0016s   0.0020s    610.7    507.1
  76 #dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
  77 #
  78 #       Again, performance increases by about 75%
  79 #
  80 #       Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
  81 #       OpenSSL 0.9.7c 30 Sep 2003
  82 #
  83 #       Original code.
  84 #
  85 #rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
  86 #rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
  87 #rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
  88 #rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
  89 #dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
  90 #dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
  91 #dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
  92 #
  93 #       Same benchmark with this assembler code:
  94 #
  95 #rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
  96 #rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
  97 #rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
  98 #rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
  99 #dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
 100 #dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
 101 #dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
 102 #
 103 #        Performance increase of ~60%
 104 #        Based on submission from Suresh N. Chari of IBM
 105
 106 $flavour = shift;                   # first command-line argument selects the target
 107 # Choose the word size and matching PowerPC mnemonics; any other flavour is fatal.
 108 if ($flavour =~ /32/) {
 109         $BITS=  32;
 110         $BNSZ=  $BITS/8;                # size of one bignum word in bytes
 111         $ISA=   "\"ppc\"";              # NOTE(review): consumer not visible in this chunk
 112
 113         $LD=    "lwz";          # load
 114         $LDU=   "lwzu";         # load and update
 115         $ST=    "stw";          # store
 116         $STU=   "stwu";         # store and update
 117         $UMULL= "mullw";        # unsigned multiply low
 118         $UMULH= "mulhwu";       # unsigned multiply high
 119         $UDIV=  "divwu";        # unsigned divide
 120         $UCMPI= "cmplwi";       # unsigned compare with immediate
 121         $UCMP=  "cmplw";        # unsigned compare
 122         $CNTLZ= "cntlzw";       # count leading zeros
 123         $SHL=   "slw";          # shift left
 124         $SHR=   "srw";          # unsigned shift right
 125         $SHRI=  "srwi";         # unsigned shift right by immediate
 126         $SHLI=  "slwi";         # shift left by immediate
 127         $CLRU=  "clrlwi";       # clear upper bits
 128         $INSR=  "insrwi";       # insert right
 129         $ROTL=  "rotlwi";       # rotate left by immediate
 130         $TR=    "tw";           # conditional trap
 131 } elsif ($flavour =~ /64/) {            # only reached when the flavour contains no "32"
 132         $BITS=  64;
 133         $BNSZ=  $BITS/8;                # size of one bignum word in bytes
 134         $ISA=   "\"ppc64\"";            # NOTE(review): consumer not visible in this chunk
 135
 136         # same as above, but 64-bit mnemonics...
 137         $LD=    "ld";           # load
 138         $LDU=   "ldu";          # load and update
 139         $ST=    "std";          # store
 140         $STU=   "stdu";         # store and update
 141         $UMULL= "mulld";        # unsigned multiply low
 142         $UMULH= "mulhdu";       # unsigned multiply high
 143         $UDIV=  "divdu";        # unsigned divide
 144         $UCMPI= "cmpldi";       # unsigned compare with immediate
 145         $UCMP=  "cmpld";        # unsigned compare
 146         $CNTLZ= "cntlzd";       # count leading zeros
 147         $SHL=   "sld";          # shift left
 148         $SHR=   "srd";          # unsigned shift right
 149         $SHRI=  "srdi";         # unsigned shift right by immediate
 150         $SHLI=  "sldi";         # shift left by immediate
 151         $CLRU=  "clrldi";       # clear upper bits
 152         $INSR=  "insrdi";       # insert right
 153         $ROTL=  "rotldi";       # rotate left by immediate
 154         $TR=    "td";           # conditional trap
 155 } else { die "nonsense $flavour"; }     # flavour names neither 32 nor 64
 156 # Find ppc-xlate.pl beside this script or in ../../perlasm, then pipe STDOUT through it.
 157 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;  # directory part (with trailing separator) of this script's path
 158 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
 159 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
 160 die "can't locate ppc-xlate.pl";
 161 # Output file name is still required; open itself is checked with "or" because "||" bound to shift only.
 162 open STDOUT,"| $^X $xlate $flavour ".(shift || die "can't call $xlate: missing output file argument") or die "can't call $xlate: $!";
 163
 164 $data=<<EOF;
 165 #--------------------------------------------------------------------
 166 #
 167 #
 168 #
 169 #
 170 #       File:           ppc32.s
 171 #
 172 #       Created by:     Suresh Chari
 173 #                       IBM Thomas J. Watson Research Library
 174 #                       Hawthorne, NY
 175 #
 176 #
 177 #       Description:    Optimized assembly routines for OpenSSL crypto
 178 #                       on the 32 bit PowerPC platform.
 179 #
 180 #
 181 #       Version History
 182 #
 183 #       2. Fixed bn_add,bn_sub and bn_div_words, added comments,
 184 #          cleaned up code. Also made a single version which can
 185 #          be used for both the AIX and Linux compilers. See NOTE
 186 #          below.
 187 #                               12/05/03                Suresh Chari
 188 #                       (with lots of help from)        Andy Polyakov
 189 ##
 190 #       1. Initial version      10/20/02                Suresh Chari
 191 #
 192 #
 193 #       The following file works for the xlc,cc
 194 #       and gcc compilers.
 195 #
 196 #       NOTE:   To get the file to link correctly with the gcc compiler
 197 #               you have to change the names of the routines and remove
 198 #               the first .(dot) character. This should automatically
 199 #               be done in the build process.
 200 #
 201 #       Hand optimized assembly code for the following routines
 202 #
 203 #       bn_sqr_comba4
 204 #       bn_sqr_comba8
 205 #       bn_mul_comba4
 206 #       bn_mul_comba8
 207 #       bn_sub_words
 208 #       bn_add_words
 209 #       bn_div_words
 210 #       bn_sqr_words
 211 #       bn_mul_words
 212 #       bn_mul_add_words
 213 #
 214 #       NOTE:   It is possible to optimize this code more for
 215 #       specific PowerPC or Power architectures. On the Northstar
 216 #       architecture the optimizations in this file do
 217 #        NOT provide much improvement.
 218 #
 219 #       If you have comments or suggestions to improve code send
 220 #       me a note at schari\@us.ibm.com
 221 #
 222 #--------------------------------------------------------------------------
 223 #
 224 #       Defines to be used in the assembly code.
 225 #
 226 #.set r0,0      # we use it as storage for value of 0
 227 #.set SP,1      # preserved
 228 #.set RTOC,2    # preserved
 229 #.set r3,3      # 1st argument/return value
 230 #.set r4,4      # 2nd argument/volatile register
 231 #.set r5,5      # 3rd argument/volatile register
 232 #.set r6,6      # ...
 233 #.set r7,7
 234 #.set r8,8
 235 #.set r9,9
 236 #.set r10,10
 237 #.set r11,11
 238 #.set r12,12
 239 #.set r13,13    # not used, nor any other "below" it...
 240
 241 #       Declare function names to be global
 242 #       NOTE:   For gcc these names MUST be changed to remove
 243 #               the first . i.e. for example change ".bn_sqr_comba4"
 244 #               to "bn_sqr_comba4". This should be automatically done
 245 #               in the build.
 246
 247         .globl  .bn_sqr_comba4
 248         .globl  .bn_sqr_comba8
 249         .globl  .bn_mul_comba4
 250         .globl  .bn_mul_comba8
 251         .globl  .bn_sub_words
 252         .globl  .bn_add_words
 253         .globl  .bn_div_words
 254         .globl  .bn_sqr_words
 255         .globl  .bn_mul_words
 256         .globl  .bn_mul_add_words
 257
 258 # .text section
 259
 260         .machine        "any"
 261
 262 #
 263 #       NOTE:   The following label name should be changed to
 264 #               "bn_sqr_comba4" i.e. remove the first dot
 265 #               for the gcc compiler. This should be automatically
 266 #               done in the build
 267 #
 268
 269 .align  4
 270 .bn_sqr_comba4:
 271 #
 272 # Optimized version of bn_sqr_comba4: r[0..7] = a[0..3] squared.
 273 #
 274 # void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
 275 # r3 contains r (8 words are stored)
 276 # r4 contains a (4 words are loaded)
 277 #
 278 # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
 279 #
 280 # r5,r6 are the two BN_ULONGs being multiplied.
 281 # r7,r8 are the low and high words of the double-width product.
 282 # r9,r10, r11 are the equivalents of c1,c2, c3 (the roles rotate per column).
 283 # r0 is zeroed and serves as the constant 0 operand of addze.
 284 # Each sqr_add_c2 step doubles (r8,r7) before accumulating it.
 285 #
 286         xor             r0,r0,r0                # set r0 = 0. Used in the addze
 287                                                 # instructions below
 288
 289                                                 #sqr_add_c(a,0,c1,c2,c3)
 290         $LD             r5,`0*$BNSZ`(r4)
 291         $UMULL          r9,r5,r5
 292         $UMULH          r10,r5,r5               #in first iteration. No need
 293                                                 #to add since c1=c2=c3=0.
 294                                                 # Note c3(r11) is NOT set to 0
 295                                                 # here; the first addze r11 below writes it.
 296
 297         $ST             r9,`0*$BNSZ`(r3)        # r[0]=c1;
 298                                                 # sqr_add_c2(a,1,0,c2,c3,c1);
 299         $LD             r6,`1*$BNSZ`(r4)
 300         $UMULL          r7,r5,r6
 301         $UMULH          r8,r5,r6
 302
 303         addc            r7,r7,r7                # compute (r7,r8)=2*(r7,r8)
 304         adde            r8,r8,r8
 305         addze           r9,r0                   # catch carry if any.
 306                                                 # r9= r0(=0) and carry
 307
 308         addc            r10,r7,r10              # now add to temp result.
 309         addze           r11,r8                  # r11 = r8 + carry; this initializes c3
 310         addze           r9,r9                   # fold the carry out of r11 into r9
 311
 312         $ST             r10,`1*$BNSZ`(r3)       #r[1]=c2;
 313                                                 #sqr_add_c(a,1,c3,c1,c2)
 314         $UMULL          r7,r6,r6
 315         $UMULH          r8,r6,r6
 316         addc            r11,r7,r11
 317         adde            r9,r8,r9
 318         addze           r10,r0
 319                                                 #sqr_add_c2(a,2,0,c3,c1,c2)
 320         $LD             r6,`2*$BNSZ`(r4)
 321         $UMULL          r7,r5,r6
 322         $UMULH          r8,r5,r6
 323
 324         addc            r7,r7,r7
 325         adde            r8,r8,r8
 326         addze           r10,r10
 327
 328         addc            r11,r7,r11
 329         adde            r9,r8,r9
 330         addze           r10,r10
 331         $ST             r11,`2*$BNSZ`(r3)       #r[2]=c3
 332                                                 #sqr_add_c2(a,3,0,c1,c2,c3);
 333         $LD             r6,`3*$BNSZ`(r4)
 334         $UMULL          r7,r5,r6
 335         $UMULH          r8,r5,r6
 336         addc            r7,r7,r7
 337         adde            r8,r8,r8
 338         addze           r11,r0
 339
 340         addc            r9,r7,r9
 341         adde            r10,r8,r10
 342         addze           r11,r11
 343                                                 #sqr_add_c2(a,2,1,c1,c2,c3);
 344         $LD             r5,`1*$BNSZ`(r4)
 345         $LD             r6,`2*$BNSZ`(r4)
 346         $UMULL          r7,r5,r6
 347         $UMULH          r8,r5,r6
 348
 349         addc            r7,r7,r7
 350         adde            r8,r8,r8
 351         addze           r11,r11
 352         addc            r9,r7,r9
 353         adde            r10,r8,r10
 354         addze           r11,r11
 355         $ST             r9,`3*$BNSZ`(r3)        #r[3]=c1
 356                                                 #sqr_add_c(a,2,c2,c3,c1);
 357         $UMULL          r7,r6,r6
 358         $UMULH          r8,r6,r6
 359         addc            r10,r7,r10
 360         adde            r11,r8,r11
 361         addze           r9,r0
 362                                                 #sqr_add_c2(a,3,1,c2,c3,c1);
 363         $LD             r6,`3*$BNSZ`(r4)
 364         $UMULL          r7,r5,r6
 365         $UMULH          r8,r5,r6
 366         addc            r7,r7,r7
 367         adde            r8,r8,r8
 368         addze           r9,r9
 369
 370         addc            r10,r7,r10
 371         adde            r11,r8,r11
 372         addze           r9,r9
 373         $ST             r10,`4*$BNSZ`(r3)       #r[4]=c2
 374                                                 #sqr_add_c2(a,3,2,c3,c1,c2);
 375         $LD             r5,`2*$BNSZ`(r4)
 376         $UMULL          r7,r5,r6
 377         $UMULH          r8,r5,r6
 378         addc            r7,r7,r7
 379         adde            r8,r8,r8
 380         addze           r10,r0
 381
 382         addc            r11,r7,r11
 383         adde            r9,r8,r9
 384         addze           r10,r10
 385         $ST             r11,`5*$BNSZ`(r3)       #r[5] = c3
 386                                                 #sqr_add_c(a,3,c1,c2,c3);
 387         $UMULL          r7,r6,r6
 388         $UMULH          r8,r6,r6
 389         addc            r9,r7,r9
 390         adde            r10,r8,r10
 391
 392         $ST             r9,`6*$BNSZ`(r3)        #r[6]=c1
 393         $ST             r10,`7*$BNSZ`(r3)       #r[7]=c2
 394         blr                                     # return; NOTE(review): data words below presumably form a traceback table - confirm
 395         .long   0
 396         .byte   0,12,0x14,0,0,0,2,0
 397         .long   0
 398 .size   .bn_sqr_comba4,.-.bn_sqr_comba4
 399
 400 #
 401 #       NOTE:   The following label name should be changed to
 402 #               "bn_sqr_comba8" i.e. remove the first dot
 403 #               for the gcc compiler. This should be automatically
 404 #               done in the build
 405 #
 406
 407 .align  4
 408 .bn_sqr_comba8:
 409 #
 410 # This is an optimized version of the bn_sqr_comba8 routine.
 411 # Tightly uses the adde instruction
 412 #
 413 #
 414 # void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
 415 # r3 contains r
 416 # r4 contains a
 417 #
 418 # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
 419 #
 420 # r5,r6 are the two BN_ULONGs being multiplied.
 421 # r7,r8 are the results of the 32x32 giving 64 bit multiply.
 422 # r9,r10, r11 are the equivalents of c1,c2, c3.
 423 #
 424 # Possible optimization of loading all 8 longs of a into registers
 425 # doesn't provide any speedup
 426 #
 427
 428         xor             r0,r0,r0                #set r0 = 0.Used in addze
 429                                                 #instructions below.
 430
 431                                                 #sqr_add_c(a,0,c1,c2,c3);
 432         $LD             r5,`0*$BNSZ`(r4)
 433         $UMULL          r9,r5,r5                #1st iteration: no carries.
 434         $UMULH          r10,r5,r5
 435         $ST             r9,`0*$BNSZ`(r3)        # r[0]=c1;
 436                                                 #sqr_add_c2(a,1,0,c2,c3,c1);
 437         $LD             r6,`1*$BNSZ`(r4)
 438         $UMULL          r7,r5,r6
 439         $UMULH          r8,r5,r6
 440
 441         addc            r10,r7,r10              #add the two register number
 442         adde            r11,r8,r0               # (r8,r7) to the three register
 443         addze           r9,r0                   # number (r9,r11,r10).NOTE:r0=0
 444
 445         addc            r10,r7,r10              #add the two register number
 446         adde            r11,r8,r11              # (r8,r7) to the three register
 447         addze           r9,r9                   # number (r9,r11,r10).
 448
 449         $ST             r10,`1*$BNSZ`(r3)       # r[1]=c2
 450
 451                                                 #sqr_add_c(a,1,c3,c1,c2);
 452         $UMULL          r7,r6,r6
 453         $UMULH          r8,r6,r6
 454         addc            r11,r7,r11
 455         adde            r9,r8,r9
 456         addze           r10,r0
 457                                                 #sqr_add_c2(a,2,0,c3,c1,c2);
 458         $LD             r6,`2*$BNSZ`(r4)
 459         $UMULL          r7,r5,r6
 460         $UMULH          r8,r5,r6
 461
 462         addc            r11,r7,r11
 463         adde            r9,r8,r9
 464         addze           r10,r10
 465
 466         addc            r11,r7,r11
 467         adde            r9,r8,r9
 468         addze           r10,r10
 469
 470         $ST             r11,`2*$BNSZ`(r3)       #r[2]=c3
 471                                                 #sqr_add_c2(a,3,0,c1,c2,c3);
 472         $LD             r6,`3*$BNSZ`(r4)        #r6 = a[3]. r5 is already a[0].
 473         $UMULL          r7,r5,r6
 474         $UMULH          r8,r5,r6
 475
 476         addc            r9,r7,r9
 477         adde            r10,r8,r10
 478         addze           r11,r0
 479
 480         addc            r9,r7,r9
 481         adde            r10,r8,r10
 482         addze           r11,r11
 483                                                 #sqr_add_c2(a,2,1,c1,c2,c3);
 484         $LD             r5,`1*$BNSZ`(r4)
 485         $LD             r6,`2*$BNSZ`(r4)
 486         $UMULL          r7,r5,r6
 487         $UMULH          r8,r5,r6
 488
 489         addc            r9,r7,r9
 490         adde            r10,r8,r10
 491         addze           r11,r11
 492
 493         addc            r9,r7,r9
 494         adde            r10,r8,r10
 495         addze           r11,r11
 496
 497         $ST             r9,`3*$BNSZ`(r3)        #r[3]=c1;
 498                                                 #sqr_add_c(a,2,c2,c3,c1);
 499         $UMULL          r7,r6,r6
 500         $UMULH          r8,r6,r6
 501
 502         addc            r10,r7,r10
 503         adde            r11,r8,r11
 504         addze           r9,r0
 505                                                 #sqr_add_c2(a,3,1,c2,c3,c1);
 506         $LD             r6,`3*$BNSZ`(r4)
 507         $UMULL          r7,r5,r6
 508         $UMULH          r8,r5,r6
 509
 510         addc            r10,r7,r10
 511         adde            r11,r8,r11
 512         addze           r9,r9
 513
 514         addc            r10,r7,r10
 515         adde            r11,r8,r11
 516         addze           r9,r9
 517                                                 #sqr_add_c2(a,4,0,c2,c3,c1);
 518         $LD             r5,`0*$BNSZ`(r4)
 519         $LD             r6,`4*$BNSZ`(r4)
 520         $UMULL          r7,r5,r6
 521         $UMULH          r8,r5,r6
 522
 523         addc            r10,r7,r10
 524         adde            r11,r8,r11
 525         addze           r9,r9
 526
 527         addc            r10,r7,r10
 528         adde            r11,r8,r11
 529         addze           r9,r9
 530         $ST             r10,`4*$BNSZ`(r3)       #r[4]=c2;
 531                                                 #sqr_add_c2(a,5,0,c3,c1,c2);
 532         $LD             r6,`5*$BNSZ`(r4)
 533         $UMULL          r7,r5,r6
 534         $UMULH          r8,r5,r6
 535
 536         addc            r11,r7,r11
 537         adde            r9,r8,r9
 538         addze           r10,r0
 539
 540         addc            r11,r7,r11
 541         adde            r9,r8,r9
 542         addze           r10,r10
 543                                                 #sqr_add_c2(a,4,1,c3,c1,c2);
 544         $LD             r5,`1*$BNSZ`(r4)
 545         $LD             r6,`4*$BNSZ`(r4)
 546         $UMULL          r7,r5,r6
 547         $UMULH          r8,r5,r6
 548
 549         addc            r11,r7,r11
 550         adde            r9,r8,r9
 551         addze           r10,r10
 552
 553         addc            r11,r7,r11
 554         adde            r9,r8,r9
 555         addze           r10,r10
 556                                                 #sqr_add_c2(a,3,2,c3,c1,c2);
 557         $LD             r5,`2*$BNSZ`(r4)
 558         $LD             r6,`3*$BNSZ`(r4)
 559         $UMULL          r7,r5,r6
 560         $UMULH          r8,r5,r6
 561
 562         addc            r11,r7,r11
 563         adde            r9,r8,r9
 564         addze           r10,r10
 565
 566         addc            r11,r7,r11
 567         adde            r9,r8,r9
 568         addze           r10,r10
 569         $ST             r11,`5*$BNSZ`(r3)       #r[5]=c3;
 570                                                 #sqr_add_c(a,3,c1,c2,c3);
 571         $UMULL          r7,r6,r6
 572         $UMULH          r8,r6,r6
 573         addc            r9,r7,r9
 574         adde            r10,r8,r10
 575         addze           r11,r0
 576                                                 #sqr_add_c2(a,4,2,c1,c2,c3);
 577         $LD             r6,`4*$BNSZ`(r4)
 578         $UMULL          r7,r5,r6
 579         $UMULH          r8,r5,r6
 580
 581         addc            r9,r7,r9
 582         adde            r10,r8,r10
 583         addze           r11,r11
 584
 585         addc            r9,r7,r9
 586         adde            r10,r8,r10
 587         addze           r11,r11
 588                                                 #sqr_add_c2(a,5,1,c1,c2,c3);
 589         $LD             r5,`1*$BNSZ`(r4)
 590         $LD             r6,`5*$BNSZ`(r4)
 591         $UMULL          r7,r5,r6
 592         $UMULH          r8,r5,r6
 593
 594         addc            r9,r7,r9
 595         adde            r10,r8,r10
 596         addze           r11,r11
 597
 598         addc            r9,r7,r9
 599         adde            r10,r8,r10
 600         addze           r11,r11
 601                                                 #sqr_add_c2(a,6,0,c1,c2,c3);
 602         $LD             r5,`0*$BNSZ`(r4)
 603         $LD             r6,`6*$BNSZ`(r4)
 604         $UMULL          r7,r5,r6
 605         $UMULH          r8,r5,r6
 606         addc            r9,r7,r9
 607         adde            r10,r8,r10
 608         addze           r11,r11
 609         addc            r9,r7,r9
 610         adde            r10,r8,r10
 611         addze           r11,r11
 612         $ST             r9,`6*$BNSZ`(r3)        #r[6]=c1;
 613                                                 #sqr_add_c2(a,7,0,c2,c3,c1);
 614         $LD             r6,`7*$BNSZ`(r4)
 615         $UMULL          r7,r5,r6
 616         $UMULH          r8,r5,r6
 617
 618         addc            r10,r7,r10
 619         adde            r11,r8,r11
 620         addze           r9,r0
 621         addc            r10,r7,r10
 622         adde            r11,r8,r11
 623         addze           r9,r9
 624                                                 #sqr_add_c2(a,6,1,c2,c3,c1);
 625         $LD             r5,`1*$BNSZ`(r4)
 626         $LD             r6,`6*$BNSZ`(r4)
 627         $UMULL          r7,r5,r6
 628         $UMULH          r8,r5,r6
 629
 630         addc            r10,r7,r10
 631         adde            r11,r8,r11
 632         addze           r9,r9
 633         addc            r10,r7,r10
 634         adde            r11,r8,r11
 635         addze           r9,r9
 636                                                 #sqr_add_c2(a,5,2,c2,c3,c1);
 637         $LD             r5,`2*$BNSZ`(r4)
 638         $LD             r6,`5*$BNSZ`(r4)
 639         $UMULL          r7,r5,r6
 640         $UMULH          r8,r5,r6
 641         addc            r10,r7,r10
 642         adde            r11,r8,r11
 643         addze           r9,r9
 644         addc            r10,r7,r10
 645         adde            r11,r8,r11
 646         addze           r9,r9
 647                                                 #sqr_add_c2(a,4,3,c2,c3,c1);
 648         $LD             r5,`3*$BNSZ`(r4)
 649         $LD             r6,`4*$BNSZ`(r4)
 650         $UMULL          r7,r5,r6
 651         $UMULH          r8,r5,r6
 652
 653         addc            r10,r7,r10
 654         adde            r11,r8,r11
 655         addze           r9,r9
 656         addc            r10,r7,r10
 657         adde            r11,r8,r11
 658         addze           r9,r9
 659         $ST             r10,`7*$BNSZ`(r3)       #r[7]=c2;
 660                                                 #sqr_add_c(a,4,c3,c1,c2);
 661         $UMULL          r7,r6,r6
 662         $UMULH          r8,r6,r6
 663         addc            r11,r7,r11
 664         adde            r9,r8,r9
 665         addze           r10,r0
 666                                                 #sqr_add_c2(a,5,3,c3,c1,c2);
 667         $LD             r6,`5*$BNSZ`(r4)
 668         $UMULL          r7,r5,r6
 669         $UMULH          r8,r5,r6
 670         addc            r11,r7,r11
 671         adde            r9,r8,r9
 672         addze           r10,r10
 673         addc            r11,r7,r11
 674         adde            r9,r8,r9
 675         addze           r10,r10
 676                                                 #sqr_add_c2(a,6,2,c3,c1,c2);
 677         $LD             r5,`2*$BNSZ`(r4)
 678         $LD             r6,`6*$BNSZ`(r4)
 679         $UMULL          r7,r5,r6
 680         $UMULH          r8,r5,r6
 681         addc            r11,r7,r11
 682         adde            r9,r8,r9
 683         addze           r10,r10
 684
 685         addc            r11,r7,r11
 686         adde            r9,r8,r9
 687         addze           r10,r10
 688                                                 #sqr_add_c2(a,7,1,c3,c1,c2);
 689         $LD             r5,`1*$BNSZ`(r4)
 690         $LD             r6,`7*$BNSZ`(r4)
 691         $UMULL          r7,r5,r6
 692         $UMULH          r8,r5,r6
 693         addc            r11,r7,r11
 694         adde            r9,r8,r9
 695         addze           r10,r10
 696         addc            r11,r7,r11
 697         adde            r9,r8,r9
 698         addze           r10,r10
 699         $ST             r11,`8*$BNSZ`(r3)       #r[8]=c3;
 700                                                 #sqr_add_c2(a,7,2,c1,c2,c3);
 701         $LD             r5,`2*$BNSZ`(r4)
 702         $UMULL          r7,r5,r6
 703         $UMULH          r8,r5,r6
 704
 705         addc            r9,r7,r9
 706         adde            r10,r8,r10
 707         addze           r11,r0
 708         addc            r9,r7,r9
 709         adde            r10,r8,r10
 710         addze           r11,r11
 711                                                 #sqr_add_c2(a,6,3,c1,c2,c3);
 712         $LD             r5,`3*$BNSZ`(r4)
 713         $LD             r6,`6*$BNSZ`(r4)
 714         $UMULL          r7,r5,r6
 715         $UMULH          r8,r5,r6
 716         addc            r9,r7,r9
 717         adde            r10,r8,r10
 718         addze           r11,r11
 719         addc            r9,r7,r9
 720         adde            r10,r8,r10
 721         addze           r11,r11
 722                                                 #sqr_add_c2(a,5,4,c1,c2,c3);
 723         $LD             r5,`4*$BNSZ`(r4)
 724         $LD             r6,`5*$BNSZ`(r4)
 725         $UMULL          r7,r5,r6
 726         $UMULH          r8,r5,r6
 727         addc            r9,r7,r9
 728         adde            r10,r8,r10
 729         addze           r11,r11
 730         addc            r9,r7,r9
 731         adde            r10,r8,r10
 732         addze           r11,r11
 733         $ST             r9,`9*$BNSZ`(r3)        #r[9]=c1;
 734                                                 #sqr_add_c(a,5,c2,c3,c1);
 735         $UMULL          r7,r6,r6
 736         $UMULH          r8,r6,r6
 737         addc            r10,r7,r10
 738         adde            r11,r8,r11
 739         addze           r9,r0
 740                                                 #sqr_add_c2(a,6,4,c2,c3,c1);
 741         $LD             r6,`6*$BNSZ`(r4)
 742         $UMULL          r7,r5,r6
 743         $UMULH          r8,r5,r6
 744         addc            r10,r7,r10
 745         adde            r11,r8,r11
 746         addze           r9,r9
 747         addc            r10,r7,r10
 748         adde            r11,r8,r11
 749         addze           r9,r9
 750                                                 #sqr_add_c2(a,7,3,c2,c3,c1);
 751         $LD             r5,`3*$BNSZ`(r4)
 752         $LD             r6,`7*$BNSZ`(r4)
 753         $UMULL          r7,r5,r6
 754         $UMULH          r8,r5,r6
 755         addc            r10,r7,r10
 756         adde            r11,r8,r11
 757         addze           r9,r9
 758         addc            r10,r7,r10
 759         adde            r11,r8,r11
 760         addze           r9,r9
 761         $ST             r10,`10*$BNSZ`(r3)      #r[10]=c2;
 762                                                 #sqr_add_c2(a,7,4,c3,c1,c2);
 763         $LD             r5,`4*$BNSZ`(r4)
 764         $UMULL          r7,r5,r6
 765         $UMULH          r8,r5,r6
 766         addc            r11,r7,r11
 767         adde            r9,r8,r9
 768         addze           r10,r0
 769         addc            r11,r7,r11
 770         adde            r9,r8,r9
 771         addze           r10,r10
 772                                                 #sqr_add_c2(a,6,5,c3,c1,c2);
 773         $LD             r5,`5*$BNSZ`(r4)
 774         $LD             r6,`6*$BNSZ`(r4)
 775         $UMULL          r7,r5,r6
 776         $UMULH          r8,r5,r6
 777         addc            r11,r7,r11
 778         adde            r9,r8,r9
 779         addze           r10,r10
 780         addc            r11,r7,r11
 781         adde            r9,r8,r9
 782         addze           r10,r10
 783         $ST             r11,`11*$BNSZ`(r3)      #r[11]=c3;
 784                                                 #sqr_add_c(a,6,c1,c2,c3);
 785         $UMULL          r7,r6,r6
 786         $UMULH          r8,r6,r6
 787         addc            r9,r7,r9
 788         adde            r10,r8,r10
 789         addze           r11,r0
 790                                                 #sqr_add_c2(a,7,5,c1,c2,c3)
 791         $LD             r6,`7*$BNSZ`(r4)
 792         $UMULL          r7,r5,r6
 793         $UMULH          r8,r5,r6
 794         addc            r9,r7,r9
 795         adde            r10,r8,r10
 796         addze           r11,r11
 797         addc            r9,r7,r9
 798         adde            r10,r8,r10
 799         addze           r11,r11
 800         $ST             r9,`12*$BNSZ`(r3)       #r[12]=c1;
 801
 802                                                 #sqr_add_c2(a,7,6,c2,c3,c1)
 803         $LD             r5,`6*$BNSZ`(r4)
 804         $UMULL          r7,r5,r6
 805         $UMULH          r8,r5,r6
 806         addc            r10,r7,r10
 807         adde            r11,r8,r11
 808         addze           r9,r0
 809         addc            r10,r7,r10
 810         adde            r11,r8,r11
 811         addze           r9,r9
 812         $ST             r10,`13*$BNSZ`(r3)      #r[13]=c2;
 813                                                 #sqr_add_c(a,7,c3,c1,c2);
 814         $UMULL          r7,r6,r6
 815         $UMULH          r8,r6,r6
 816         addc            r11,r7,r11
 817         adde            r9,r8,r9
 818         $ST             r11,`14*$BNSZ`(r3)      #r[14]=c3;
 819         $ST             r9, `15*$BNSZ`(r3)      #r[15]=c1;
 820
 821
 822         blr
 823         .long   0
 824         .byte   0,12,0x14,0,0,0,2,0
 825         .long   0
 826 .size   .bn_sqr_comba8,.-.bn_sqr_comba8
 827
 828 #
 829 #       NOTE:   The following label name should be changed to
 830 #               "bn_mul_comba4" i.e. remove the first dot
 831 #               for the gcc compiler. This should be automatically
 832 #               done in the build
 833 #
 834
 835 .align  4
 836 .bn_mul_comba4:
 837 #
 838 # Optimized, fully unrolled bn_mul_comba4: r[0..7] = a[0..3] * b[0..3].
 839 # Result words are produced in order r[0], r[1], ... r[7], each stored once.
 840 # void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 841 # r3 contains r (words r[0]..r[7] are stored)
 842 # r4 contains a (words a[0]..a[3] are loaded)
 843 # r5 contains b (words b[0]..b[3] are loaded)
 844 # r6, r7 are the 2 BN_ULONGs being multiplied.
 845 # r8, r9 receive the $UMULL / $UMULH results of r6*r7 (low / high product word, judging by the macro names).
 846 # r10, r11, r12 are the equivalents of c1, c2, and c3; their roles rotate after each store.
 847 #
 848         xor     r0,r0,r0                #r0=0. Used in addze below.
 849                                         #mul_add_c(a[0],b[0],c1,c2,c3);
 850         $LD     r6,`0*$BNSZ`(r4)        #a[0]
 851         $LD     r7,`0*$BNSZ`(r5)        #b[0]
 852         $UMULL  r10,r6,r7               #c1 = $UMULL word of a[0]*b[0]
 853         $UMULH  r11,r6,r7               #c2 = $UMULH word of a[0]*b[0]
 854         $ST     r10,`0*$BNSZ`(r3)       #r[0]=c1
 855                                         #mul_add_c(a[0],b[1],c2,c3,c1);
 856         $LD     r7,`1*$BNSZ`(r5)        #b[1]; r6 still holds a[0]
 857         $UMULL  r8,r6,r7
 858         $UMULH  r9,r6,r7
 859         addc    r11,r8,r11              #c2 += r8, carry out goes to CA
 860         adde    r12,r9,r0               #c3 = r9 + 0 + CA (first write of r12)
 861         addze   r10,r0                  #c1 = 0 + CA (c1 restarts after being stored)
 862                                         #mul_add_c(a[1],b[0],c2,c3,c1);
 863         $LD     r6, `1*$BNSZ`(r4)
 864         $LD     r7, `0*$BNSZ`(r5)
 865         $UMULL  r8,r6,r7
 866         $UMULH  r9,r6,r7
 867         addc    r11,r8,r11              #c2 += r8
 868         adde    r12,r9,r12              #c3 += r9 + CA
 869         addze   r10,r10                 #c1 += CA
 870         $ST     r11,`1*$BNSZ`(r3)       #r[1]=c2
 871                                         #mul_add_c(a[2],b[0],c3,c1,c2);
 872         $LD     r6,`2*$BNSZ`(r4)        #a[2]; r7 still holds b[0]
 873         $UMULL  r8,r6,r7
 874         $UMULH  r9,r6,r7
 875         addc    r12,r8,r12
 876         adde    r10,r9,r10
 877         addze   r11,r0                  #c2 = 0 + CA (c2 restarts after being stored)
 878                                         #mul_add_c(a[1],b[1],c3,c1,c2);
 879         $LD     r6,`1*$BNSZ`(r4)
 880         $LD     r7,`1*$BNSZ`(r5)
 881         $UMULL  r8,r6,r7
 882         $UMULH  r9,r6,r7
 883         addc    r12,r8,r12
 884         adde    r10,r9,r10
 885         addze   r11,r11
 886                                         #mul_add_c(a[0],b[2],c3,c1,c2);
 887         $LD     r6,`0*$BNSZ`(r4)
 888         $LD     r7,`2*$BNSZ`(r5)
 889         $UMULL  r8,r6,r7
 890         $UMULH  r9,r6,r7
 891         addc    r12,r8,r12
 892         adde    r10,r9,r10
 893         addze   r11,r11
 894         $ST     r12,`2*$BNSZ`(r3)       #r[2]=c3
 895                                         #mul_add_c(a[0],b[3],c1,c2,c3);
 896         $LD     r7,`3*$BNSZ`(r5)        #b[3]; r6 still holds a[0]
 897         $UMULL  r8,r6,r7
 898         $UMULH  r9,r6,r7
 899         addc    r10,r8,r10
 900         adde    r11,r9,r11
 901         addze   r12,r0                  #c3 = 0 + CA (c3 restarts after being stored)
 902                                         #mul_add_c(a[1],b[2],c1,c2,c3);
 903         $LD     r6,`1*$BNSZ`(r4)
 904         $LD     r7,`2*$BNSZ`(r5)
 905         $UMULL  r8,r6,r7
 906         $UMULH  r9,r6,r7
 907         addc    r10,r8,r10
 908         adde    r11,r9,r11
 909         addze   r12,r12
 910                                         #mul_add_c(a[2],b[1],c1,c2,c3);
 911         $LD     r6,`2*$BNSZ`(r4)
 912         $LD     r7,`1*$BNSZ`(r5)
 913         $UMULL  r8,r6,r7
 914         $UMULH  r9,r6,r7
 915         addc    r10,r8,r10
 916         adde    r11,r9,r11
 917         addze   r12,r12
 918                                         #mul_add_c(a[3],b[0],c1,c2,c3);
 919         $LD     r6,`3*$BNSZ`(r4)
 920         $LD     r7,`0*$BNSZ`(r5)
 921         $UMULL  r8,r6,r7
 922         $UMULH  r9,r6,r7
 923         addc    r10,r8,r10
 924         adde    r11,r9,r11
 925         addze   r12,r12
 926         $ST     r10,`3*$BNSZ`(r3)       #r[3]=c1
 927                                         #mul_add_c(a[3],b[1],c2,c3,c1);
 928         $LD     r7,`1*$BNSZ`(r5)        #b[1]; r6 still holds a[3]
 929         $UMULL  r8,r6,r7
 930         $UMULH  r9,r6,r7
 931         addc    r11,r8,r11
 932         adde    r12,r9,r12
 933         addze   r10,r0                  #c1 = 0 + CA (c1 restarts after being stored)
 934                                         #mul_add_c(a[2],b[2],c2,c3,c1);
 935         $LD     r6,`2*$BNSZ`(r4)
 936         $LD     r7,`2*$BNSZ`(r5)
 937         $UMULL  r8,r6,r7
 938         $UMULH  r9,r6,r7
 939         addc    r11,r8,r11
 940         adde    r12,r9,r12
 941         addze   r10,r10
 942                                         #mul_add_c(a[1],b[3],c2,c3,c1);
 943         $LD     r6,`1*$BNSZ`(r4)
 944         $LD     r7,`3*$BNSZ`(r5)
 945         $UMULL  r8,r6,r7
 946         $UMULH  r9,r6,r7
 947         addc    r11,r8,r11
 948         adde    r12,r9,r12
 949         addze   r10,r10
 950         $ST     r11,`4*$BNSZ`(r3)       #r[4]=c2
 951                                         #mul_add_c(a[2],b[3],c3,c1,c2);
 952         $LD     r6,`2*$BNSZ`(r4)        #a[2]; r7 still holds b[3]
 953         $UMULL  r8,r6,r7
 954         $UMULH  r9,r6,r7
 955         addc    r12,r8,r12
 956         adde    r10,r9,r10
 957         addze   r11,r0                  #c2 = 0 + CA (c2 restarts after being stored)
 958                                         #mul_add_c(a[3],b[2],c3,c1,c2);
 959         $LD     r6,`3*$BNSZ`(r4)
 960         $LD     r7,`2*$BNSZ`(r5)
 961         $UMULL  r8,r6,r7
 962         $UMULH  r9,r6,r7
 963         addc    r12,r8,r12
 964         adde    r10,r9,r10
 965         addze   r11,r11
 966         $ST     r12,`5*$BNSZ`(r3)       #r[5]=c3
 967                                         #mul_add_c(a[3],b[3],c1,c2,c3);
 968         $LD     r7,`3*$BNSZ`(r5)        #b[3]; r6 still holds a[3]
 969         $UMULL  r8,r6,r7
 970         $UMULH  r9,r6,r7
 971         addc    r10,r8,r10
 972         adde    r11,r9,r11              #c2 += r9 + CA; no addze follows, so a carry out of this word is not kept
 973
 974         $ST     r10,`6*$BNSZ`(r3)       #r[6]=c1
 975         $ST     r11,`7*$BNSZ`(r3)       #r[7]=c2
 976         blr
 977         .long   0
 978         .byte   0,12,0x14,0,0,0,3,0
 979         .long   0
 980 .size   .bn_mul_comba4,.-.bn_mul_comba4
 981
 982 #
 983 #       NOTE:   The following label name should be changed to
 984 #               "bn_mul_comba8" i.e. remove the first dot
 985 #               for the gcc compiler. This should be automatically
 986 #               done in the build
 987 #
 988
 989 .align  4
 990 .bn_mul_comba8:
 991 #
 992 # Optimized, fully unrolled bn_mul_comba8: r[0..15] = a[0..7] * b[0..7].
 993 # Result words are produced in order r[0], r[1], ... r[15], each stored once.
 994 # void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 995 # r3 contains r (words r[0]..r[15] are stored)
 996 # r4 contains a (words a[0]..a[7] are loaded)
 997 # r5 contains b (words b[0]..b[7] are loaded)
 998 # r6, r7 are the 2 BN_ULONGs being multiplied.
 999 # r8, r9 receive the $UMULL / $UMULH results of r6*r7 (low / high product word, judging by the macro names).
1000 # r10, r11, r12 are the equivalents of c1, c2, and c3; their roles rotate after each store.
1001 #
1002         xor     r0,r0,r0                #r0=0. Used in addze below.
1003
1004                                         #mul_add_c(a[0],b[0],c1,c2,c3);
1005         $LD     r6,`0*$BNSZ`(r4)        #a[0]
1006         $LD     r7,`0*$BNSZ`(r5)        #b[0]
1007         $UMULL  r10,r6,r7               #c1 = $UMULL word of a[0]*b[0]
1008         $UMULH  r11,r6,r7               #c2 = $UMULH word of a[0]*b[0]
1009         $ST     r10,`0*$BNSZ`(r3)       #r[0]=c1;
1010                                         #mul_add_c(a[0],b[1],c2,c3,c1);
1011         $LD     r7,`1*$BNSZ`(r5)        #b[1]; r6 still holds a[0]
1012         $UMULL  r8,r6,r7
1013         $UMULH  r9,r6,r7
1014         addc    r11,r11,r8              #c2 += r8, carry out goes to CA
1015         addze   r12,r9                  #c3 = r9 + CA; addze instead of adde since we didn't set r12 to zero before.
1016         addze   r10,r0                  #c1 = 0 + CA (c1 restarts after being stored)
1017                                         #mul_add_c(a[1],b[0],c2,c3,c1);
1018         $LD     r6,`1*$BNSZ`(r4)
1019         $LD     r7,`0*$BNSZ`(r5)
1020         $UMULL  r8,r6,r7
1021         $UMULH  r9,r6,r7
1022         addc    r11,r11,r8              #c2 += r8
1023         adde    r12,r12,r9              #c3 += r9 + CA
1024         addze   r10,r10                 #c1 += CA
1025         $ST     r11,`1*$BNSZ`(r3)       #r[1]=c2;
1026                                         #mul_add_c(a[2],b[0],c3,c1,c2);
1027         $LD     r6,`2*$BNSZ`(r4)        #a[2]; r7 still holds b[0]
1028         $UMULL  r8,r6,r7
1029         $UMULH  r9,r6,r7
1030         addc    r12,r12,r8
1031         adde    r10,r10,r9
1032         addze   r11,r0                  #c2 = 0 + CA (c2 restarts after being stored)
1033                                         #mul_add_c(a[1],b[1],c3,c1,c2);
1034         $LD     r6,`1*$BNSZ`(r4)
1035         $LD     r7,`1*$BNSZ`(r5)
1036         $UMULL  r8,r6,r7
1037         $UMULH  r9,r6,r7
1038         addc    r12,r12,r8
1039         adde    r10,r10,r9
1040         addze   r11,r11
1041                                         #mul_add_c(a[0],b[2],c3,c1,c2);
1042         $LD     r6,`0*$BNSZ`(r4)
1043         $LD     r7,`2*$BNSZ`(r5)
1044         $UMULL  r8,r6,r7
1045         $UMULH  r9,r6,r7
1046         addc    r12,r12,r8
1047         adde    r10,r10,r9
1048         addze   r11,r11
1049         $ST     r12,`2*$BNSZ`(r3)       #r[2]=c3;
1050                                         #mul_add_c(a[0],b[3],c1,c2,c3);
1051         $LD     r7,`3*$BNSZ`(r5)        #b[3]; r6 still holds a[0]
1052         $UMULL  r8,r6,r7
1053         $UMULH  r9,r6,r7
1054         addc    r10,r10,r8
1055         adde    r11,r11,r9
1056         addze   r12,r0                  #c3 = 0 + CA (c3 restarts after being stored)
1057                                         #mul_add_c(a[1],b[2],c1,c2,c3);
1058         $LD     r6,`1*$BNSZ`(r4)
1059         $LD     r7,`2*$BNSZ`(r5)
1060         $UMULL  r8,r6,r7
1061         $UMULH  r9,r6,r7
1062         addc    r10,r10,r8
1063         adde    r11,r11,r9
1064         addze   r12,r12
1065
1066                                         #mul_add_c(a[2],b[1],c1,c2,c3);
1067         $LD     r6,`2*$BNSZ`(r4)
1068         $LD     r7,`1*$BNSZ`(r5)
1069         $UMULL  r8,r6,r7
1070         $UMULH  r9,r6,r7
1071         addc    r10,r10,r8
1072         adde    r11,r11,r9
1073         addze   r12,r12
1074                                         #mul_add_c(a[3],b[0],c1,c2,c3);
1075         $LD     r6,`3*$BNSZ`(r4)
1076         $LD     r7,`0*$BNSZ`(r5)
1077         $UMULL  r8,r6,r7
1078         $UMULH  r9,r6,r7
1079         addc    r10,r10,r8
1080         adde    r11,r11,r9
1081         addze   r12,r12
1082         $ST     r10,`3*$BNSZ`(r3)       #r[3]=c1;
1083                                         #mul_add_c(a[4],b[0],c2,c3,c1);
1084         $LD     r6,`4*$BNSZ`(r4)        #a[4]; r7 still holds b[0]
1085         $UMULL  r8,r6,r7
1086         $UMULH  r9,r6,r7
1087         addc    r11,r11,r8
1088         adde    r12,r12,r9
1089         addze   r10,r0                  #c1 = 0 + CA (c1 restarts after being stored)
1090                                         #mul_add_c(a[3],b[1],c2,c3,c1);
1091         $LD     r6,`3*$BNSZ`(r4)
1092         $LD     r7,`1*$BNSZ`(r5)
1093         $UMULL  r8,r6,r7
1094         $UMULH  r9,r6,r7
1095         addc    r11,r11,r8
1096         adde    r12,r12,r9
1097         addze   r10,r10
1098                                         #mul_add_c(a[2],b[2],c2,c3,c1);
1099         $LD     r6,`2*$BNSZ`(r4)
1100         $LD     r7,`2*$BNSZ`(r5)
1101         $UMULL  r8,r6,r7
1102         $UMULH  r9,r6,r7
1103         addc    r11,r11,r8
1104         adde    r12,r12,r9
1105         addze   r10,r10
1106                                         #mul_add_c(a[1],b[3],c2,c3,c1);
1107         $LD     r6,`1*$BNSZ`(r4)
1108         $LD     r7,`3*$BNSZ`(r5)
1109         $UMULL  r8,r6,r7
1110         $UMULH  r9,r6,r7
1111         addc    r11,r11,r8
1112         adde    r12,r12,r9
1113         addze   r10,r10
1114                                         #mul_add_c(a[0],b[4],c2,c3,c1);
1115         $LD     r6,`0*$BNSZ`(r4)
1116         $LD     r7,`4*$BNSZ`(r5)
1117         $UMULL  r8,r6,r7
1118         $UMULH  r9,r6,r7
1119         addc    r11,r11,r8
1120         adde    r12,r12,r9
1121         addze   r10,r10
1122         $ST     r11,`4*$BNSZ`(r3)       #r[4]=c2;
1123                                         #mul_add_c(a[0],b[5],c3,c1,c2);
1124         $LD     r7,`5*$BNSZ`(r5)        #b[5]; r6 still holds a[0]
1125         $UMULL  r8,r6,r7
1126         $UMULH  r9,r6,r7
1127         addc    r12,r12,r8
1128         adde    r10,r10,r9
1129         addze   r11,r0                  #c2 = 0 + CA (c2 restarts after being stored)
1130                                         #mul_add_c(a[1],b[4],c3,c1,c2);
1131         $LD     r6,`1*$BNSZ`(r4)
1132         $LD     r7,`4*$BNSZ`(r5)
1133         $UMULL  r8,r6,r7
1134         $UMULH  r9,r6,r7
1135         addc    r12,r12,r8
1136         adde    r10,r10,r9
1137         addze   r11,r11
1138                                         #mul_add_c(a[2],b[3],c3,c1,c2);
1139         $LD     r6,`2*$BNSZ`(r4)
1140         $LD     r7,`3*$BNSZ`(r5)
1141         $UMULL  r8,r6,r7
1142         $UMULH  r9,r6,r7
1143         addc    r12,r12,r8
1144         adde    r10,r10,r9
1145         addze   r11,r11
1146                                         #mul_add_c(a[3],b[2],c3,c1,c2);
1147         $LD     r6,`3*$BNSZ`(r4)
1148         $LD     r7,`2*$BNSZ`(r5)
1149         $UMULL  r8,r6,r7
1150         $UMULH  r9,r6,r7
1151         addc    r12,r12,r8
1152         adde    r10,r10,r9
1153         addze   r11,r11
1154                                         #mul_add_c(a[4],b[1],c3,c1,c2);
1155         $LD     r6,`4*$BNSZ`(r4)
1156         $LD     r7,`1*$BNSZ`(r5)
1157         $UMULL  r8,r6,r7
1158         $UMULH  r9,r6,r7
1159         addc    r12,r12,r8
1160         adde    r10,r10,r9
1161         addze   r11,r11
1162                                         #mul_add_c(a[5],b[0],c3,c1,c2);
1163         $LD     r6,`5*$BNSZ`(r4)
1164         $LD     r7,`0*$BNSZ`(r5)
1165         $UMULL  r8,r6,r7
1166         $UMULH  r9,r6,r7
1167         addc    r12,r12,r8
1168         adde    r10,r10,r9
1169         addze   r11,r11
1170         $ST     r12,`5*$BNSZ`(r3)       #r[5]=c3;
1171                                         #mul_add_c(a[6],b[0],c1,c2,c3);
1172         $LD     r6,`6*$BNSZ`(r4)        #a[6]; r7 still holds b[0]
1173         $UMULL  r8,r6,r7
1174         $UMULH  r9,r6,r7
1175         addc    r10,r10,r8
1176         adde    r11,r11,r9
1177         addze   r12,r0                  #c3 = 0 + CA (c3 restarts after being stored)
1178                                         #mul_add_c(a[5],b[1],c1,c2,c3);
1179         $LD     r6,`5*$BNSZ`(r4)
1180         $LD     r7,`1*$BNSZ`(r5)
1181         $UMULL  r8,r6,r7
1182         $UMULH  r9,r6,r7
1183         addc    r10,r10,r8
1184         adde    r11,r11,r9
1185         addze   r12,r12
1186                                         #mul_add_c(a[4],b[2],c1,c2,c3);
1187         $LD     r6,`4*$BNSZ`(r4)
1188         $LD     r7,`2*$BNSZ`(r5)
1189         $UMULL  r8,r6,r7
1190         $UMULH  r9,r6,r7
1191         addc    r10,r10,r8
1192         adde    r11,r11,r9
1193         addze   r12,r12
1194                                         #mul_add_c(a[3],b[3],c1,c2,c3);
1195         $LD     r6,`3*$BNSZ`(r4)
1196         $LD     r7,`3*$BNSZ`(r5)
1197         $UMULL  r8,r6,r7
1198         $UMULH  r9,r6,r7
1199         addc    r10,r10,r8
1200         adde    r11,r11,r9
1201         addze   r12,r12
1202                                         #mul_add_c(a[2],b[4],c1,c2,c3);
1203         $LD     r6,`2*$BNSZ`(r4)
1204         $LD     r7,`4*$BNSZ`(r5)
1205         $UMULL  r8,r6,r7
1206         $UMULH  r9,r6,r7
1207         addc    r10,r10,r8
1208         adde    r11,r11,r9
1209         addze   r12,r12
1210                                         #mul_add_c(a[1],b[5],c1,c2,c3);
1211         $LD     r6,`1*$BNSZ`(r4)
1212         $LD     r7,`5*$BNSZ`(r5)
1213         $UMULL  r8,r6,r7
1214         $UMULH  r9,r6,r7
1215         addc    r10,r10,r8
1216         adde    r11,r11,r9
1217         addze   r12,r12
1218                                         #mul_add_c(a[0],b[6],c1,c2,c3);
1219         $LD     r6,`0*$BNSZ`(r4)
1220         $LD     r7,`6*$BNSZ`(r5)
1221         $UMULL  r8,r6,r7
1222         $UMULH  r9,r6,r7
1223         addc    r10,r10,r8
1224         adde    r11,r11,r9
1225         addze   r12,r12
1226         $ST     r10,`6*$BNSZ`(r3)       #r[6]=c1;
1227                                         #mul_add_c(a[0],b[7],c2,c3,c1);
1228         $LD     r7,`7*$BNSZ`(r5)        #b[7]; r6 still holds a[0]
1229         $UMULL  r8,r6,r7
1230         $UMULH  r9,r6,r7
1231         addc    r11,r11,r8
1232         adde    r12,r12,r9
1233         addze   r10,r0                  #c1 = 0 + CA (c1 restarts after being stored)
1234                                         #mul_add_c(a[1],b[6],c2,c3,c1);
1235         $LD     r6,`1*$BNSZ`(r4)
1236         $LD     r7,`6*$BNSZ`(r5)
1237         $UMULL  r8,r6,r7
1238         $UMULH  r9,r6,r7
1239         addc    r11,r11,r8
1240         adde    r12,r12,r9
1241         addze   r10,r10
1242                                         #mul_add_c(a[2],b[5],c2,c3,c1);
1243         $LD     r6,`2*$BNSZ`(r4)
1244         $LD     r7,`5*$BNSZ`(r5)
1245         $UMULL  r8,r6,r7
1246         $UMULH  r9,r6,r7
1247         addc    r11,r11,r8
1248         adde    r12,r12,r9
1249         addze   r10,r10
1250                                         #mul_add_c(a[3],b[4],c2,c3,c1);
1251         $LD     r6,`3*$BNSZ`(r4)
1252         $LD     r7,`4*$BNSZ`(r5)
1253         $UMULL  r8,r6,r7
1254         $UMULH  r9,r6,r7
1255         addc    r11,r11,r8
1256         adde    r12,r12,r9
1257         addze   r10,r10
1258                                         #mul_add_c(a[4],b[3],c2,c3,c1);
1259         $LD     r6,`4*$BNSZ`(r4)
1260         $LD     r7,`3*$BNSZ`(r5)
1261         $UMULL  r8,r6,r7
1262         $UMULH  r9,r6,r7
1263         addc    r11,r11,r8
1264         adde    r12,r12,r9
1265         addze   r10,r10
1266                                         #mul_add_c(a[5],b[2],c2,c3,c1);
1267         $LD     r6,`5*$BNSZ`(r4)
1268         $LD     r7,`2*$BNSZ`(r5)
1269         $UMULL  r8,r6,r7
1270         $UMULH  r9,r6,r7
1271         addc    r11,r11,r8
1272         adde    r12,r12,r9
1273         addze   r10,r10
1274                                         #mul_add_c(a[6],b[1],c2,c3,c1);
1275         $LD     r6,`6*$BNSZ`(r4)
1276         $LD     r7,`1*$BNSZ`(r5)
1277         $UMULL  r8,r6,r7
1278         $UMULH  r9,r6,r7
1279         addc    r11,r11,r8
1280         adde    r12,r12,r9
1281         addze   r10,r10
1282                                         #mul_add_c(a[7],b[0],c2,c3,c1);
1283         $LD     r6,`7*$BNSZ`(r4)
1284         $LD     r7,`0*$BNSZ`(r5)
1285         $UMULL  r8,r6,r7
1286         $UMULH  r9,r6,r7
1287         addc    r11,r11,r8
1288         adde    r12,r12,r9
1289         addze   r10,r10
1290         $ST     r11,`7*$BNSZ`(r3)       #r[7]=c2;
1291                                         #mul_add_c(a[7],b[1],c3,c1,c2);
1292         $LD     r7,`1*$BNSZ`(r5)        #b[1]; r6 still holds a[7]
1293         $UMULL  r8,r6,r7
1294         $UMULH  r9,r6,r7
1295         addc    r12,r12,r8
1296         adde    r10,r10,r9
1297         addze   r11,r0                  #c2 = 0 + CA (c2 restarts after being stored)
1298                                         #mul_add_c(a[6],b[2],c3,c1,c2);
1299         $LD     r6,`6*$BNSZ`(r4)
1300         $LD     r7,`2*$BNSZ`(r5)
1301         $UMULL  r8,r6,r7
1302         $UMULH  r9,r6,r7
1303         addc    r12,r12,r8
1304         adde    r10,r10,r9
1305         addze   r11,r11
1306                                         #mul_add_c(a[5],b[3],c3,c1,c2);
1307         $LD     r6,`5*$BNSZ`(r4)
1308         $LD     r7,`3*$BNSZ`(r5)
1309         $UMULL  r8,r6,r7
1310         $UMULH  r9,r6,r7
1311         addc    r12,r12,r8
1312         adde    r10,r10,r9
1313         addze   r11,r11
1314                                         #mul_add_c(a[4],b[4],c3,c1,c2);
1315         $LD     r6,`4*$BNSZ`(r4)
1316         $LD     r7,`4*$BNSZ`(r5)
1317         $UMULL  r8,r6,r7
1318         $UMULH  r9,r6,r7
1319         addc    r12,r12,r8
1320         adde    r10,r10,r9
1321         addze   r11,r11
1322                                         #mul_add_c(a[3],b[5],c3,c1,c2);
1323         $LD     r6,`3*$BNSZ`(r4)
1324         $LD     r7,`5*$BNSZ`(r5)
1325         $UMULL  r8,r6,r7
1326         $UMULH  r9,r6,r7
1327         addc    r12,r12,r8
1328         adde    r10,r10,r9
1329         addze   r11,r11
1330                                         #mul_add_c(a[2],b[6],c3,c1,c2);
1331         $LD     r6,`2*$BNSZ`(r4)
1332         $LD     r7,`6*$BNSZ`(r5)
1333         $UMULL  r8,r6,r7
1334         $UMULH  r9,r6,r7
1335         addc    r12,r12,r8
1336         adde    r10,r10,r9
1337         addze   r11,r11
1338                                         #mul_add_c(a[1],b[7],c3,c1,c2);
1339         $LD     r6,`1*$BNSZ`(r4)
1340         $LD     r7,`7*$BNSZ`(r5)
1341         $UMULL  r8,r6,r7
1342         $UMULH  r9,r6,r7
1343         addc    r12,r12,r8
1344         adde    r10,r10,r9
1345         addze   r11,r11
1346         $ST     r12,`8*$BNSZ`(r3)       #r[8]=c3;
1347                                         #mul_add_c(a[2],b[7],c1,c2,c3);
1348         $LD     r6,`2*$BNSZ`(r4)        #a[2]; r7 still holds b[7]
1349         $UMULL  r8,r6,r7
1350         $UMULH  r9,r6,r7
1351         addc    r10,r10,r8
1352         adde    r11,r11,r9
1353         addze   r12,r0                  #c3 = 0 + CA (c3 restarts after being stored)
1354                                         #mul_add_c(a[3],b[6],c1,c2,c3);
1355         $LD     r6,`3*$BNSZ`(r4)
1356         $LD     r7,`6*$BNSZ`(r5)
1357         $UMULL  r8,r6,r7
1358         $UMULH  r9,r6,r7
1359         addc    r10,r10,r8
1360         adde    r11,r11,r9
1361         addze   r12,r12
1362                                         #mul_add_c(a[4],b[5],c1,c2,c3);
1363         $LD     r6,`4*$BNSZ`(r4)
1364         $LD     r7,`5*$BNSZ`(r5)
1365         $UMULL  r8,r6,r7
1366         $UMULH  r9,r6,r7
1367         addc    r10,r10,r8
1368         adde    r11,r11,r9
1369         addze   r12,r12
1370                                         #mul_add_c(a[5],b[4],c1,c2,c3);
1371         $LD     r6,`5*$BNSZ`(r4)
1372         $LD     r7,`4*$BNSZ`(r5)
1373         $UMULL  r8,r6,r7
1374         $UMULH  r9,r6,r7
1375         addc    r10,r10,r8
1376         adde    r11,r11,r9
1377         addze   r12,r12
1378                                         #mul_add_c(a[6],b[3],c1,c2,c3);
1379         $LD     r6,`6*$BNSZ`(r4)
1380         $LD     r7,`3*$BNSZ`(r5)
1381         $UMULL  r8,r6,r7
1382         $UMULH  r9,r6,r7
1383         addc    r10,r10,r8
1384         adde    r11,r11,r9
1385         addze   r12,r12
1386                                         #mul_add_c(a[7],b[2],c1,c2,c3);
1387         $LD     r6,`7*$BNSZ`(r4)
1388         $LD     r7,`2*$BNSZ`(r5)
1389         $UMULL  r8,r6,r7
1390         $UMULH  r9,r6,r7
1391         addc    r10,r10,r8
1392         adde    r11,r11,r9
1393         addze   r12,r12
1394         $ST     r10,`9*$BNSZ`(r3)       #r[9]=c1;
1395                                         #mul_add_c(a[7],b[3],c2,c3,c1);
1396         $LD     r7,`3*$BNSZ`(r5)        #b[3]; r6 still holds a[7]
1397         $UMULL  r8,r6,r7
1398         $UMULH  r9,r6,r7
1399         addc    r11,r11,r8
1400         adde    r12,r12,r9
1401         addze   r10,r0                  #c1 = 0 + CA (c1 restarts after being stored)
1402                                         #mul_add_c(a[6],b[4],c2,c3,c1);
1403         $LD     r6,`6*$BNSZ`(r4)
1404         $LD     r7,`4*$BNSZ`(r5)
1405         $UMULL  r8,r6,r7
1406         $UMULH  r9,r6,r7
1407         addc    r11,r11,r8
1408         adde    r12,r12,r9
1409         addze   r10,r10
1410                                         #mul_add_c(a[5],b[5],c2,c3,c1);
1411         $LD     r6,`5*$BNSZ`(r4)
1412         $LD     r7,`5*$BNSZ`(r5)
1413         $UMULL  r8,r6,r7
1414         $UMULH  r9,r6,r7
1415         addc    r11,r11,r8
1416         adde    r12,r12,r9
1417         addze   r10,r10
1418                                         #mul_add_c(a[4],b[6],c2,c3,c1);
1419         $LD     r6,`4*$BNSZ`(r4)
1420         $LD     r7,`6*$BNSZ`(r5)
1421         $UMULL  r8,r6,r7
1422         $UMULH  r9,r6,r7
1423         addc    r11,r11,r8
1424         adde    r12,r12,r9
1425         addze   r10,r10
1426                                         #mul_add_c(a[3],b[7],c2,c3,c1);
1427         $LD     r6,`3*$BNSZ`(r4)
1428         $LD     r7,`7*$BNSZ`(r5)
1429         $UMULL  r8,r6,r7
1430         $UMULH  r9,r6,r7
1431         addc    r11,r11,r8
1432         adde    r12,r12,r9
1433         addze   r10,r10
1434         $ST     r11,`10*$BNSZ`(r3)      #r[10]=c2;
1435                                         #mul_add_c(a[4],b[7],c3,c1,c2);
1436         $LD     r6,`4*$BNSZ`(r4)        #a[4]; r7 still holds b[7]
1437         $UMULL  r8,r6,r7
1438         $UMULH  r9,r6,r7
1439         addc    r12,r12,r8
1440         adde    r10,r10,r9
1441         addze   r11,r0                  #c2 = 0 + CA (c2 restarts after being stored)
1442                                         #mul_add_c(a[5],b[6],c3,c1,c2);
1443         $LD     r6,`5*$BNSZ`(r4)
1444         $LD     r7,`6*$BNSZ`(r5)
1445         $UMULL  r8,r6,r7
1446         $UMULH  r9,r6,r7
1447         addc    r12,r12,r8
1448         adde    r10,r10,r9
1449         addze   r11,r11
1450                                         #mul_add_c(a[6],b[5],c3,c1,c2);
1451         $LD     r6,`6*$BNSZ`(r4)
1452         $LD     r7,`5*$BNSZ`(r5)
1453         $UMULL  r8,r6,r7
1454         $UMULH  r9,r6,r7
1455         addc    r12,r12,r8
1456         adde    r10,r10,r9
1457         addze   r11,r11
1458                                         #mul_add_c(a[7],b[4],c3,c1,c2);
1459         $LD     r6,`7*$BNSZ`(r4)
1460         $LD     r7,`4*$BNSZ`(r5)
1461         $UMULL  r8,r6,r7
1462         $UMULH  r9,r6,r7
1463         addc    r12,r12,r8
1464         adde    r10,r10,r9
1465         addze   r11,r11
1466         $ST     r12,`11*$BNSZ`(r3)      #r[11]=c3;
1467                                         #mul_add_c(a[7],b[5],c1,c2,c3);
1468         $LD     r7,`5*$BNSZ`(r5)        #b[5]; r6 still holds a[7]
1469         $UMULL  r8,r6,r7
1470         $UMULH  r9,r6,r7
1471         addc    r10,r10,r8
1472         adde    r11,r11,r9
1473         addze   r12,r0                  #c3 = 0 + CA (c3 restarts after being stored)
1474                                         #mul_add_c(a[6],b[6],c1,c2,c3);
1475         $LD     r6,`6*$BNSZ`(r4)
1476         $LD     r7,`6*$BNSZ`(r5)
1477         $UMULL  r8,r6,r7
1478         $UMULH  r9,r6,r7
1479         addc    r10,r10,r8
1480         adde    r11,r11,r9
1481         addze   r12,r12
1482                                         #mul_add_c(a[5],b[7],c1,c2,c3);
1483         $LD     r6,`5*$BNSZ`(r4)
1484         $LD     r7,`7*$BNSZ`(r5)
1485         $UMULL  r8,r6,r7
1486         $UMULH  r9,r6,r7
1487         addc    r10,r10,r8
1488         adde    r11,r11,r9
1489         addze   r12,r12
1490         $ST     r10,`12*$BNSZ`(r3)      #r[12]=c1;
1491                                         #mul_add_c(a[6],b[7],c2,c3,c1);
1492         $LD     r6,`6*$BNSZ`(r4)        #a[6]; r7 still holds b[7]
1493         $UMULL  r8,r6,r7
1494         $UMULH  r9,r6,r7
1495         addc    r11,r11,r8
1496         adde    r12,r12,r9
1497         addze   r10,r0                  #c1 = 0 + CA (c1 restarts after being stored)
1498                                         #mul_add_c(a[7],b[6],c2,c3,c1);
1499         $LD     r6,`7*$BNSZ`(r4)
1500         $LD     r7,`6*$BNSZ`(r5)
1501         $UMULL  r8,r6,r7
1502         $UMULH  r9,r6,r7
1503         addc    r11,r11,r8
1504         adde    r12,r12,r9
1505         addze   r10,r10
1506         $ST     r11,`13*$BNSZ`(r3)      #r[13]=c2;
1507                                         #mul_add_c(a[7],b[7],c3,c1,c2);
1508         $LD     r7,`7*$BNSZ`(r5)        #b[7]; r6 still holds a[7]
1509         $UMULL  r8,r6,r7
1510         $UMULH  r9,r6,r7
1511         addc    r12,r12,r8
1512         adde    r10,r10,r9              #c1 += r9 + CA; no addze follows, so a carry out of this word is not kept
1513         $ST     r12,`14*$BNSZ`(r3)      #r[14]=c3;
1514         $ST     r10,`15*$BNSZ`(r3)      #r[15]=c1;
1515         blr
1516         .long   0
1517         .byte   0,12,0x14,0,0,0,3,0
1518         .long   0
1519 .size   .bn_mul_comba8,.-.bn_mul_comba8
1520
1521 #
1522 #       NOTE:   The following label name should be changed to
1523 #               "bn_sub_words" i.e. remove the first dot
1524 #               for the gcc compiler. This should be automatically
1525 #               done in the build
1526 #
1527 #
1528 .align  4
1529 .bn_sub_words:
1530 #
1531 #       Handcoded version of bn_sub_words
1532 #       Word-wise r[i] = a[i] - b[i] for n words, with the borrow chained through the carry bit.
1533 #BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1534 #       Returns (in r3) 1 if the carry bit ends up clear, i.e. a borrow is left over, else 0.
1535 #       r3 = r
1536 #       r4 = a
1537 #       r5 = b
1538 #       r6 = n
1539 #
1540 #       Note:   No loop unrolling done since this is not a performance
1541 #               critical loop.
1542
1543         xor     r0,r0,r0        #set r0 = 0
1544 #
1545 #       check for r6 = 0 AND set carry bit.
1546 #
1547         subfc.  r7,r0,r6        # r7 = r6 - r0. If r6 is 0 then result is 0.
1548                                 # if r6 > 0 then result !=0
1549                                 # In either case carry bit is set (subtracting 0 never borrows).
1550         beq     Lppcasm_sub_adios       # n == 0: nothing to do, return 0
1551         addi    r4,r4,-$BNSZ            # bias a, r and b back by one word because the
1552         addi    r3,r3,-$BNSZ            # loop below addresses each of them at offset $BNSZ
1553         addi    r5,r5,-$BNSZ            # (assuming $LDU and $STU are the update-form load/store)
1554         mtctr   r6                      # CTR = n, the loop count for bdnz
1555 Lppcasm_sub_mainloop:
1556         $LDU    r7,$BNSZ(r4)            # r7 = next word of a
1557         $LDU    r8,$BNSZ(r5)            # r8 = next word of b
1558         subfe   r6,r8,r7        # r6 = r7+carry bit + onescomplement(r8)
1559                                 # if carry = 1 this is r7-r8. Else it
1560                                 # is r7-r8 -1 as we need.
1561         $STU    r6,$BNSZ(r3)            # store the result word into r
1562         bdnz    Lppcasm_sub_mainloop    # decrement CTR, loop while it is nonzero
1563 Lppcasm_sub_adios:
1564         subfze  r3,r0           # if carry bit is set then r3 = 0 else -1
1565         andi.   r3,r3,1         # keep only last bit: return value is 0 or 1.
1566         blr
1567         .long   0
1568         .byte   0,12,0x14,0,0,0,4,0
1569         .long   0
1570 .size   .bn_sub_words,.-.bn_sub_words
1571
1572 #
1573 #       NOTE:   The following label name should be changed to
1574 #               "bn_add_words" i.e. remove the first dot
1575 #               for the gcc compiler. This should be automatically
1576 #               done in the build
1577 #
1578
1579 .align  4
1580 .bn_add_words:
1581 #
1582 #       Handcoded version of bn_add_words
1583 #       Word-wise r[i] = a[i] + b[i] for n words, with the carry chained through the carry bit.
1584 #BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1585 #       Returns (in r3) the final carry bit, 0 or 1.
1586 #       r3 = r
1587 #       r4 = a
1588 #       r5 = b
1589 #       r6 = n
1590 #
1591 #       Note:   No loop unrolling done since this is not a performance
1592 #               critical loop.
1593
1594         xor     r0,r0,r0
1595 #
1596 #       check for r6 = 0. This guards the loop: bdnz decrements CTR before
1597 #       testing it, so entering the loop with CTR = 0 would not stop after zero words.
1598         addic.  r6,r6,0         #test r6 and clear carry bit.
1599         beq     Lppcasm_add_adios       # n == 0: nothing to do, return 0
1600         addi    r4,r4,-$BNSZ            # bias a, r and b back by one word because the
1601         addi    r3,r3,-$BNSZ            # loop below addresses each of them at offset $BNSZ
1602         addi    r5,r5,-$BNSZ            # (assuming $LDU and $STU are the update-form load/store)
1603         mtctr   r6                      # CTR = n, the loop count for bdnz
1604 Lppcasm_add_mainloop:
1605         $LDU    r7,$BNSZ(r4)            # r7 = next word of a
1606         $LDU    r8,$BNSZ(r5)            # r8 = next word of b
1607         adde    r8,r7,r8                # r8 = r7 + r8 + carry bit, carry out kept for the next word
1608         $STU    r8,$BNSZ(r3)            # store the result word into r
1609         bdnz    Lppcasm_add_mainloop    # decrement CTR, loop while it is nonzero
1610 Lppcasm_add_adios:
1611         addze   r3,r0                   #return carry bit (r3 = 0 + carry).
1612         blr
1613         .long   0
1614         .byte   0,12,0x14,0,0,0,4,0
1615         .long   0
1616 .size   .bn_add_words,.-.bn_add_words
1617
1618 #
1619 #       NOTE:   The following label name should be changed to
1620 #               "bn_div_words" i.e. remove the first dot
1621 #               for the gcc compiler. This should be automatically
1622 #               done in the build
1623 #
1624
1625 .align  4
1626 .bn_div_words:
1627 #       bn_div_words: quotient of the double word h:l divided by d.
1628 #       This is a cleaned up version of code generated by
1629 #       the AIX compiler. The only optimization is to use
1630 #       the PPC instruction to count leading zeros instead
1631 #       of call to num_bits_word. Since this was compiled
1632 #       only at level -O2 we can possibly squeeze it more?
1633 #       Returns the quotient in r3, or -1 when d is 0.
1634 #       r3 = h
1635 #       r4 = l
1636 #       r5 = d
1637
1638         $UCMPI  0,r5,0                  # compare r5 and 0
1639         bne     Lppcasm_div1            # proceed if d!=0
1640         li      r3,-1                   # d=0 return -1
1641         blr
1642 Lppcasm_div1:
1643         xor     r0,r0,r0                #r0=0
1644         li      r8,$BITS                #r8 = word size in bits
1645         $CNTLZ. r7,r5                   #r7 = num leading 0s in d.
1646         beq     Lppcasm_div2            #proceed if no leading zeros
1647         subf    r8,r7,r8                #r8 = BN_num_bits_word(d)
1648         $SHR.   r9,r3,r8                #are there any bits above r8'th?
1649         $TR     16,r9,r0                #if there're, signal to dump core...
1650 Lppcasm_div2:
1651         $UCMP   0,r3,r5                 #h>=d?
1652         blt     Lppcasm_div3            #goto Lppcasm_div3 if not
1653         subf    r3,r5,r3                #h-=d ;
1654 Lppcasm_div3:                           #r7 = i (leading zeros of d), r8 = BN_BITS2-i
1655         cmpi    0,0,r7,0                # is (i == 0)?
1656         beq     Lppcasm_div4            # nothing to shift, d is already normalized
1657         $SHL    r3,r3,r7                # h = (h<< i)
1658         $SHR    r8,r4,r8                # r8 = (l >> BN_BITS2 -i)
1659         $SHL    r5,r5,r7                # d<<=i
1660         or      r3,r3,r8                # h = (h<<i)|(l>>(BN_BITS2-i))
1661         $SHL    r4,r4,r7                # l <<=i
1662 Lppcasm_div4:
1663         $SHRI   r9,r5,`$BITS/2`         # r9 = dh
1664                                         # dl will be computed when needed
1665                                         # as it saves registers.
1666         li      r6,2                    #r6=2, one outer pass per quotient half
1667         mtctr   r6                      #counter will be in count.
1668 Lppcasm_divouterloop:
1669         $SHRI   r8,r3,`$BITS/2`         #r8 = (h>>BN_BITS4)
1670         $SHRI   r11,r4,`$BITS/2`        #r11= (l&BN_MASK2h)>>BN_BITS4
1671                                         # compute here for innerloop.
1672         $UCMP   0,r8,r9                 # is (h>>BN_BITS4)==dh
1673         bne     Lppcasm_div5            # goto Lppcasm_div5 if not
1674
1675         li      r8,-1                   # all ones, low half kept by the next line
1676         $CLRU   r8,r8,`$BITS/2`         #q = BN_MASK2l
1677         b       Lppcasm_div6
1678 Lppcasm_div5:
1679         $UDIV   r8,r3,r9                #q = h/dh
1680 Lppcasm_div6:
1681         $UMULL  r12,r9,r8               #th = q*dh
1682         $CLRU   r10,r5,`$BITS/2`        #r10=dl
1683         $UMULL  r6,r8,r10               #tl = q*dl
1684
1685 Lppcasm_divinnerloop:
1686         subf    r10,r12,r3              #t = h -th
1687         $SHRI   r7,r10,`$BITS/2`        #r7= (t &BN_MASK2H), sort of...
1688         addic.  r7,r7,0                 #test if r7 == 0. used below.
1689                                         # now want to compute
1690                                         # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
1691                                         # the following 2 instructions do that
1692         $SHLI   r7,r10,`$BITS/2`        # r7 = (t<<BN_BITS4)
1693         or      r7,r7,r11               # r7|=((l&BN_MASK2h)>>BN_BITS4)
1694         $UCMP   cr1,r6,r7               # compare (tl <= r7)
1695         bne     Lppcasm_divinnerexit    # cr0 from addic. above: exit if t has high-half bits
1696         ble     cr1,Lppcasm_divinnerexit        # exit if tl <= r7, q needs no correction
1697         addi    r8,r8,-1                #q--
1698         subf    r12,r9,r12              #th -=dh
1699         $CLRU   r10,r5,`$BITS/2`        #r10=dl. t is no longer needed in loop.
1700         subf    r6,r10,r6               #tl -=dl
1701         b       Lppcasm_divinnerloop
1702 Lppcasm_divinnerexit:
1703         $SHRI   r10,r6,`$BITS/2`        #t=(tl>>BN_BITS4)
1704         $SHLI   r11,r6,`$BITS/2`        #tl=(tl<<BN_BITS4)&BN_MASK2h;
1705         $UCMP   cr1,r4,r11              # compare l and tl
1706         add     r12,r12,r10             # th+=t
1707         bge     cr1,Lppcasm_div7        # if (l>=tl) goto Lppcasm_div7
1708         addi    r12,r12,1               # th++
1709 Lppcasm_div7:
1710         subf    r11,r11,r4              #r11=l-tl
1711         $UCMP   cr1,r3,r12              #compare h and th
1712         bge     cr1,Lppcasm_div8        #if (h>=th) goto Lppcasm_div8
1713         addi    r8,r8,-1                # q--
1714         add     r3,r5,r3                # h+=d
1715 Lppcasm_div8:
1716         subf    r12,r12,r3              #r12 = h-th
1717         $SHLI   r4,r11,`$BITS/2`        #l=(l&BN_MASK2l)<<BN_BITS4
1718                                         # want to compute
1719                                         # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
1720                                         # the following 2 instructions will do this.
1721         $INSR   r11,r12,`$BITS/2`,`$BITS/2`     # r11 is the value we want rotated $BITS/2.
1722         $ROTL   r3,r11,`$BITS/2`        # rotate by $BITS/2 and store in r3
1723         bdz     Lppcasm_div9            #if (count==0) break ;
1724         $SHLI   r0,r8,`$BITS/2`         #ret =q<<BN_BITS4
1725         b       Lppcasm_divouterloop
1726 Lppcasm_div9:
1727         or      r3,r8,r0                #ret |= q, combine the two quotient halves
1728         blr
1729         .long   0
1730         .byte   0,12,0x14,0,0,0,3,0
1731         .long   0
1732 .size   .bn_div_words,.-.bn_div_words
1733
1734 #
1735 #       NOTE:   The following label name should be changed to
1736 #               "bn_sqr_words" i.e. remove the first dot
1737 #               for the gcc compiler. This should be automatically
1738 #               done in the build
1739 #
1740 .align  4
1741 .bn_sqr_words:
1742 #
1743 #       Optimized version of bn_sqr_words
1744 #       Writes two words to r for every word read from a.
1745 #       void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
1746 #
1747 #       r3 = r
1748 #       r4 = a
1749 #       r5 = n
1750 #
1751 #       r6 = a[i].
1752 #       r7,r8 = product (r7 is stored first, then r8).
1753 #
1754 #       No unrolling done here. Not performance critical.
1755
1756         addic.  r5,r5,0                 #test r5.
1757         beq     Lppcasm_sqr_adios       #n == 0, nothing to do
1758         addi    r4,r4,-$BNSZ            #step both pointers back one word; LDU/STU
1759         addi    r3,r3,-$BNSZ            #presumably pre-increment before each access.
1760         mtctr   r5                      #ctr = n, the loop trip count.
1761 Lppcasm_sqr_mainloop:
1762                                         #sqr(r[0],r[1],a[0]);
1763         $LDU    r6,$BNSZ(r4)            #r6 = next word of a
1764         $UMULL  r7,r6,r6                #presumably the low word of r6*r6
1765         $UMULH  r8,r6,r6                #presumably the high word of r6*r6
1766         $STU    r7,$BNSZ(r3)            #first output word of this pair
1767         $STU    r8,$BNSZ(r3)            #second output word of this pair
1768         bdnz    Lppcasm_sqr_mainloop
1769 Lppcasm_sqr_adios:
1770         blr
1771         .long   0
1772         .byte   0,12,0x14,0,0,0,3,0
1773         .long   0
1774 .size   .bn_sqr_words,.-.bn_sqr_words
1775
1776 #
1777 #       NOTE:   The following label name should be changed to
1778 #               "bn_mul_words" i.e. remove the first dot
1779 #               for the gcc compiler. This should be automatically
1780 #               done in the build
1781 #
1782
1783 .align  4
1784 .bn_mul_words:
1785 # rp[i] = low word of ap[i]*w + carry word; the last carry word is returned.
1786 # BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1787 #
1788 # r3 = rp
1789 # r4 = ap
1790 # r5 = num
1791 # r6 = w
1792         xor     r0,r0,r0                # r0 = 0 (not referenced again in this routine)
1793         xor     r12,r12,r12             # used for carry
1794         rlwinm. r7,r5,30,2,31           # num >> 2
1795         beq     Lppcasm_mw_REM          # fewer than 4 words, go to the leftover code
1796         mtctr   r7                      # ctr = number of 4-word groups
1797 Lppcasm_mw_LOOP:
1798                                         #mul(rp[0],ap[0],w,c1);
1799         $LD     r8,`0*$BNSZ`(r4)
1800         $UMULL  r9,r6,r8
1801         $UMULH  r10,r6,r8
1802         addc    r9,r9,r12
1803         #addze  r10,r10                 #carry is NOT ignored.
1804                                         #will be taken care of
1805                                         #in second spin below
1806                                         #using adde.
1807         $ST     r9,`0*$BNSZ`(r3)
1808                                         #mul(rp[1],ap[1],w,c1);
1809         $LD     r8,`1*$BNSZ`(r4)
1810         $UMULL  r11,r6,r8
1811         $UMULH  r12,r6,r8
1812         adde    r11,r11,r10
1813         #addze  r12,r12
1814         $ST     r11,`1*$BNSZ`(r3)
1815                                         #mul(rp[2],ap[2],w,c1);
1816         $LD     r8,`2*$BNSZ`(r4)
1817         $UMULL  r9,r6,r8
1818         $UMULH  r10,r6,r8
1819         adde    r9,r9,r12
1820         #addze  r10,r10
1821         $ST     r9,`2*$BNSZ`(r3)
1822                                         #mul(rp[3],ap[3],w,c1);
1823         $LD     r8,`3*$BNSZ`(r4)
1824         $UMULL  r11,r6,r8
1825         $UMULH  r12,r6,r8
1826         adde    r11,r11,r10
1827         addze   r12,r12                 #this spin we collect carry into
1828                                         #r12
1829         $ST     r11,`3*$BNSZ`(r3)
1830
1831         addi    r3,r3,`4*$BNSZ`         # advance rp by four words
1832         addi    r4,r4,`4*$BNSZ`         # advance ap by four words
1833         bdnz    Lppcasm_mw_LOOP
1834
1835 Lppcasm_mw_REM:
1836         andi.   r5,r5,0x3               # r5 = num & 3, the leftover word count
1837         beq     Lppcasm_mw_OVER
1838                                         #mul(rp[0],ap[0],w,c1);
1839         $LD     r8,`0*$BNSZ`(r4)
1840         $UMULL  r9,r6,r8
1841         $UMULH  r10,r6,r8
1842         addc    r9,r9,r12
1843         addze   r10,r10
1844         $ST     r9,`0*$BNSZ`(r3)
1845         addi    r12,r10,0               # r12 = r10, carry word for the next step
1846
1847         addi    r5,r5,-1
1848         cmpli   0,0,r5,0
1849         beq     Lppcasm_mw_OVER
1850
1851
1852                                         #mul(rp[1],ap[1],w,c1);
1853         $LD     r8,`1*$BNSZ`(r4)
1854         $UMULL  r9,r6,r8
1855         $UMULH  r10,r6,r8
1856         addc    r9,r9,r12
1857         addze   r10,r10
1858         $ST     r9,`1*$BNSZ`(r3)
1859         addi    r12,r10,0               # r12 = r10, carry word for the next step
1860
1861         addi    r5,r5,-1
1862         cmpli   0,0,r5,0
1863         beq     Lppcasm_mw_OVER
1864
1865                                         #mul(rp[2],ap[2],w,c1);
1866         $LD     r8,`2*$BNSZ`(r4)
1867         $UMULL  r9,r6,r8
1868         $UMULH  r10,r6,r8
1869         addc    r9,r9,r12
1870         addze   r10,r10
1871         $ST     r9,`2*$BNSZ`(r3)
1872         addi    r12,r10,0               # r12 = r10, the carry word to return
1873
1874 Lppcasm_mw_OVER:
1875         addi    r3,r12,0                # return the carry word in r3
1876         blr
1877         .long   0
1878         .byte   0,12,0x14,0,0,0,4,0
1879         .long   0
1880 .size   .bn_mul_words,.-.bn_mul_words
1881
1882 #
1883 #       NOTE:   The following label name should be changed to
1884 #               "bn_mul_add_words" i.e. remove the first dot
1885 #               for the gcc compiler. This should be automatically
1886 #               done in the build
1887 #
1888
1889 .align  4
1890 .bn_mul_add_words:
1891 # rp[i] = low word of rp[i] + ap[i]*w + carry word; returns the last carry word.
1892 # BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1893 #
1894 # r3 = rp
1895 # r4 = ap
1896 # r5 = num
1897 # r6 = w
1898 #
1899 # empirical evidence suggests that unrolled version performs best!!
1900 #
1901         xor     r0,r0,r0                #r0 = 0 (not referenced again in this routine)
1902         xor     r12,r12,r12             #r12 = 0 . used for carry
1903         rlwinm. r7,r5,30,2,31           # num >> 2
1904         beq     Lppcasm_maw_leftover    # if (num < 4) go LPPCASM_maw_leftover
1905         mtctr   r7                      # ctr = number of 4-word groups
1906 Lppcasm_maw_mainloop:
1907                                         #mul_add(rp[0],ap[0],w,c1);
1908         $LD     r8,`0*$BNSZ`(r4)
1909         $LD     r11,`0*$BNSZ`(r3)
1910         $UMULL  r9,r6,r8
1911         $UMULH  r10,r6,r8
1912         addc    r9,r9,r12               #r12 is carry.
1913         addze   r10,r10
1914         addc    r9,r9,r11
1915         #addze  r10,r10
1916                                         #the above instruction addze
1917                                         #is NOT needed. Carry will NOT
1918                                         #be ignored. It's not affected
1919                                         #by multiply and will be collected
1920                                         #in the next spin
1921         $ST     r9,`0*$BNSZ`(r3)
1922
1923                                         #mul_add(rp[1],ap[1],w,c1);
1924         $LD     r8,`1*$BNSZ`(r4)
1925         $LD     r9,`1*$BNSZ`(r3)
1926         $UMULL  r11,r6,r8
1927         $UMULH  r12,r6,r8
1928         adde    r11,r11,r10             #r10 is carry.
1929         addze   r12,r12
1930         addc    r11,r11,r9
1931         #addze  r12,r12
1932         $ST     r11,`1*$BNSZ`(r3)
1933
1934                                         #mul_add(rp[2],ap[2],w,c1);
1935         $LD     r8,`2*$BNSZ`(r4)
1936         $UMULL  r9,r6,r8
1937         $LD     r11,`2*$BNSZ`(r3)
1938         $UMULH  r10,r6,r8
1939         adde    r9,r9,r12
1940         addze   r10,r10
1941         addc    r9,r9,r11
1942         #addze  r10,r10
1943         $ST     r9,`2*$BNSZ`(r3)
1944
1945                                         #mul_add(rp[3],ap[3],w,c1);
1946         $LD     r8,`3*$BNSZ`(r4)
1947         $UMULL  r11,r6,r8
1948         $LD     r9,`3*$BNSZ`(r3)
1949         $UMULH  r12,r6,r8
1950         adde    r11,r11,r10
1951         addze   r12,r12
1952         addc    r11,r11,r9
1953         addze   r12,r12                 #last word of the group: fold the pending carry into r12
1954         $ST     r11,`3*$BNSZ`(r3)
1955         addi    r3,r3,`4*$BNSZ`         # advance rp by four words
1956         addi    r4,r4,`4*$BNSZ`         # advance ap by four words
1957         bdnz    Lppcasm_maw_mainloop
1958
1959 Lppcasm_maw_leftover:
1960         andi.   r5,r5,0x3               # r5 = num & 3, the leftover word count
1961         beq     Lppcasm_maw_adios
1962         addi    r3,r3,-$BNSZ            # step both pointers back one word; the LDU
1963         addi    r4,r4,-$BNSZ            # forms below presumably pre-increment.
1964                                         #mul_add(rp[0],ap[0],w,c1);
1965         mtctr   r5                      # ctr = leftover count, bdz below exits when it runs out
1966         $LDU    r8,$BNSZ(r4)
1967         $UMULL  r9,r6,r8
1968         $UMULH  r10,r6,r8
1969         $LDU    r11,$BNSZ(r3)
1970         addc    r9,r9,r11
1971         addze   r10,r10
1972         addc    r9,r9,r12
1973         addze   r12,r10                 # r12 = r10 + carry bit, carry word for the next step
1974         $ST     r9,0(r3)
1975
1976         bdz     Lppcasm_maw_adios
1977                                         #mul_add(rp[1],ap[1],w,c1);
1978         $LDU    r8,$BNSZ(r4)
1979         $UMULL  r9,r6,r8
1980         $UMULH  r10,r6,r8
1981         $LDU    r11,$BNSZ(r3)
1982         addc    r9,r9,r11
1983         addze   r10,r10
1984         addc    r9,r9,r12
1985         addze   r12,r10                 # r12 = r10 + carry bit, carry word for the next step
1986         $ST     r9,0(r3)
1987
1988         bdz     Lppcasm_maw_adios
1989                                         #mul_add(rp[2],ap[2],w,c1);
1990         $LDU    r8,$BNSZ(r4)
1991         $UMULL  r9,r6,r8
1992         $UMULH  r10,r6,r8
1993         $LDU    r11,$BNSZ(r3)
1994         addc    r9,r9,r11
1995         addze   r10,r10
1996         addc    r9,r9,r12
1997         addze   r12,r10                 # r12 = r10 + carry bit, the carry word to return
1998         $ST     r9,0(r3)
1999
2000 Lppcasm_maw_adios:
2001         addi    r3,r12,0                # return the carry word in r3
2002         blr
2003         .long   0
2004         .byte   0,12,0x14,0,0,0,4,0
2005         .long   0
2006 .size   .bn_mul_add_words,.-.bn_mul_add_words
2007         .align  4
2008 EOF
2009 $data =~ s/\`([^\`]*)\`/eval $1/gem;   # replace each backtick-quoted expression in the template with its evaluated value
2010 print $data;                           # emit the finished text on STDOUT
2011 close STDOUT or die "error closing STDOUT: $!";   # buffered write errors only surface at close; fail loudly instead of leaving a truncated file