bn/asm/ppc.pl: harmonize .size directive in bn_mul_words.
1 #! /usr/bin/env perl
2 # Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 # Implemented as a Perl wrapper as we want to support several different
10 # architectures with a single file. We pick the target based on the
11 # file name we are asked to generate.
12 #
13 # It should be noted though that this perl code is nothing like
14 # <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
15 # as a pre-processor to cover for platform differences in name decoration,
16 # linker tables, 32-/64-bit instruction sets...
17 #
18 # As you might know, there are several PowerPC ABIs in use. Most notably,
19 # Linux and AIX use different 32-bit ABIs. The good news is that these
20 # ABIs are similar enough to implement leaf(!) functions, which can be
21 # made ABI neutral. And that's what you find here: ABI-neutral leaf functions.
22 # In case you wonder what that is...
23 #
24 #       AIX performance
25 #
26 #       MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
27 #
28 #       The following is the performance of 32-bit compiler
29 #       generated code:
30 #
31 #       OpenSSL 0.9.6c 21 Dec 2001
32 #       built on: Tue Jun 11 11:06:51 EDT 2002
33 #       options:bn(64,32) ...
34 #compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
35 #                  sign    verify    sign/s verify/s
36 #rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
37 #rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
38 #rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
39 #rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
40 #dsa  512 bits   0.0087s   0.0106s    114.3     94.5
41 #dsa 1024 bits   0.0256s   0.0313s     39.0     32.0    
42 #
43 #       Same benchmark with this assembler code:
44 #
45 #rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
46 #rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
47 #rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
48 #rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
49 #dsa  512 bits   0.0052s   0.0062s    191.6    162.0
50 #dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
51 #
52 #       Number of operations per second increases by almost 75%
53 #
54 #       Here are performance numbers for 64-bit compiler
55 #       generated code:
56 #
57 #       OpenSSL 0.9.6g [engine] 9 Aug 2002
58 #       built on: Fri Apr 18 16:59:20 EDT 2003
59 #       options:bn(64,64) ...
60 #       compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
61 #                  sign    verify    sign/s verify/s
62 #rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
63 #rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
64 #rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
65 #rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
66 #dsa  512 bits   0.0026s   0.0032s    382.5    313.7
67 #dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
68 #
69 #       Same benchmark with this assembler code:
70 #
71 #rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
72 #rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
73 #rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
74 #rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
75 #dsa  512 bits   0.0016s   0.0020s    610.7    507.1
76 #dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
77 #       
78 #       Again, performance increases by about 75%
79 #
80 #       Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
81 #       OpenSSL 0.9.7c 30 Sep 2003
82 #
83 #       Original code.
84 #
85 #rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
86 #rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
87 #rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
88 #rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
89 #dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
90 #dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
91 #dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
92 #
93 #       Same benchmark with this assembler code:
94 #
95 #rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
96 #rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
97 #rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
98 #rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
99 #dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
100 #dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
101 #dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
102 #
103 #        Performance increase of ~60%
104 #
105 #       If you have comments or suggestions to improve code send
106 #       me a note at schari@us.ibm.com
107 #
108
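# The generated assembly is piped through ppc-xlate.pl (located below),
# with the second command-line argument naming the output file. A typical
# invocation therefore looks something like this (flavour and file name
# are illustrative, not prescribed by this script):
#
#       perl ppc.pl linux32 ppc32.s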
109 $flavour = shift;
110
111 if ($flavour =~ /32/) {
112         $BITS=  32;
113         $BNSZ=  $BITS/8;
114         $ISA=   "\"ppc\"";
115
116         $LD=    "lwz";          # load
117         $LDU=   "lwzu";         # load and update
118         $ST=    "stw";          # store
119         $STU=   "stwu";         # store and update
120         $UMULL= "mullw";        # unsigned multiply low
121         $UMULH= "mulhwu";       # unsigned multiply high
122         $UDIV=  "divwu";        # unsigned divide
123         $UCMPI= "cmplwi";       # unsigned compare with immediate
124         $UCMP=  "cmplw";        # unsigned compare
125         $CNTLZ= "cntlzw";       # count leading zeros
126         $SHL=   "slw";          # shift left
127         $SHR=   "srw";          # unsigned shift right
128         $SHRI=  "srwi";         # unsigned shift right by immediate     
129         $SHLI=  "slwi";         # shift left by immediate
130         $CLRU=  "clrlwi";       # clear upper bits
131         $INSR=  "insrwi";       # insert right
132         $ROTL=  "rotlwi";       # rotate left by immediate
133         $TR=    "tw";           # conditional trap
134 } elsif ($flavour =~ /64/) {
135         $BITS=  64;
136         $BNSZ=  $BITS/8;
137         $ISA=   "\"ppc64\"";
138
139         # same as above, but 64-bit mnemonics...
140         $LD=    "ld";           # load
141         $LDU=   "ldu";          # load and update
142         $ST=    "std";          # store
143         $STU=   "stdu";         # store and update
144         $UMULL= "mulld";        # unsigned multiply low
145         $UMULH= "mulhdu";       # unsigned multiply high
146         $UDIV=  "divdu";        # unsigned divide
147         $UCMPI= "cmpldi";       # unsigned compare with immediate
148         $UCMP=  "cmpld";        # unsigned compare
149         $CNTLZ= "cntlzd";       # count leading zeros
150         $SHL=   "sld";          # shift left
151         $SHR=   "srd";          # unsigned shift right
152         $SHRI=  "srdi";         # unsigned shift right by immediate     
153         $SHLI=  "sldi";         # shift left by immediate
154         $CLRU=  "clrldi";       # clear upper bits
155         $INSR=  "insrdi";       # insert right 
156         $ROTL=  "rotldi";       # rotate left by immediate
157         $TR=    "td";           # conditional trap
158 } else { die "nonsense $flavour"; }
159
160 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
161 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
162 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
163 die "can't locate ppc-xlate.pl";
164
165 open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
166
167 $data=<<EOF;
168 #--------------------------------------------------------------------
169 #
170 #
171 #
172 #
173 #       File:           ppc32.s
174 #
175 #       Created by:     Suresh Chari
176 #                       IBM Thomas J. Watson Research Library
177 #                       Hawthorne, NY
178 #
179 #
180 #       Description:    Optimized assembly routines for OpenSSL crypto
181 #                       on the 32-bit PowerPC platform.
182 #
183 #
184 #       Version History
185 #
186 #       2. Fixed bn_add, bn_sub and bn_div_words; added comments and
187 #          cleaned up the code. Also made a single version which can
188 #          be used with both the AIX and Linux compilers. See NOTE
189 #          below.
190 #                               12/05/03                Suresh Chari
191 #                       (with lots of help from)        Andy Polyakov
192 #
193 #       1. Initial version      10/20/02                Suresh Chari
194 #
195 #
196 #       The following file works with the xlc, cc,
197 #       and gcc compilers.
198 #
199 #       NOTE:   To get the file to link correctly with the gcc compiler
200 #               you have to change the names of the routines and remove
201 #               the first . (dot) character. This should automatically
202 #               be done in the build process.
203 #
204 #       Hand optimized assembly code for the following routines
205 #       
206 #       bn_sqr_comba4
207 #       bn_sqr_comba8
208 #       bn_mul_comba4
209 #       bn_mul_comba8
210 #       bn_sub_words
211 #       bn_add_words
212 #       bn_div_words
213 #       bn_sqr_words
214 #       bn_mul_words
215 #       bn_mul_add_words
216 #
217 #       NOTE:   It is possible to optimize this code further for
218 #       specific PowerPC or Power architectures. On the Northstar
219 #       architecture the optimizations in this file do
220 #       NOT provide much improvement.
221 #
222 #       If you have comments or suggestions to improve code send
223 #       me a note at schari\@us.ibm.com
224 #
225 #--------------------------------------------------------------------------
226 #
227 #       Defines to be used in the assembly code.
228 #       
229 #.set r0,0      # we use it as storage for value of 0
230 #.set SP,1      # preserved
231 #.set RTOC,2    # preserved 
232 #.set r3,3      # 1st argument/return value
233 #.set r4,4      # 2nd argument/volatile register
234 #.set r5,5      # 3rd argument/volatile register
235 #.set r6,6      # ...
236 #.set r7,7
237 #.set r8,8
238 #.set r9,9
239 #.set r10,10
240 #.set r11,11
241 #.set r12,12
242 #.set r13,13    # not used, nor any other "below" it...
243
244 #       Declare function names to be global
245 #       NOTE:   For gcc these names MUST be changed to remove
246 #               the first . i.e. for example change ".bn_sqr_comba4"
247 #               to "bn_sqr_comba4". This should be automatically done
248 #               in the build.
249         
250         .globl  .bn_sqr_comba4
251         .globl  .bn_sqr_comba8
252         .globl  .bn_mul_comba4
253         .globl  .bn_mul_comba8
254         .globl  .bn_sub_words
255         .globl  .bn_add_words
256         .globl  .bn_div_words
257         .globl  .bn_sqr_words
258         .globl  .bn_mul_words
259         .globl  .bn_mul_add_words
260         
261 # .text section
262         
263         .machine        "any"
264
265 #
266 #       NOTE:   The following label name should be changed to
267 #               "bn_sqr_comba4" i.e. remove the first dot
268 #               for the gcc compiler. This should be automatically
269 #               done in the build
270 #
271
272 .align  4
273 .bn_sqr_comba4:
274 #
275 # Optimized version of bn_sqr_comba4.
276 #
277 # void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
278 # r3 contains r
279 # r4 contains a
280 #
281 # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:       
282
283 # r5,r6 are the two BN_ULONGs being multiplied.
284 # r7,r8 are the results of the 32x32 giving 64 bit multiply.
285 # r9,r10, r11 are the equivalents of c1,c2, c3.
286 # Here's the assembly
287 #
288 #
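# For reference, each sqr_add_c/sqr_add_c2 step below open-codes an
# accumulation of a double-word square (or twice a cross product) into a
# rotating three-word total. Conceptually, and only as a sketch of the
# generic comba macros in crypto/bn/bn_asm.c, not a literal copy:
#
#       sqr_add_c(a,i,c0,c1,c2):     (hi,lo) = a[i]*a[i]
#                                    c0 += lo; c1 += hi + carry; c2 += carry
#       sqr_add_c2(a,i,j,c0,c1,c2):  (hi,lo) = 2*a[i]*a[j]
#                                    c0 += lo; c1 += hi + carry; c2 += carry
#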
289         xor             r0,r0,r0                # set r0 = 0. Used in the addze
290                                                 # instructions below
291         
292                                                 #sqr_add_c(a,0,c1,c2,c3)
293         $LD             r5,`0*$BNSZ`(r4)                
294         $UMULL          r9,r5,r5                
295         $UMULH          r10,r5,r5               #in the first iteration no add
296                                                 #is needed since c1=c2=c3=0.
297                                                 # Note c3(r11) is NOT set to 0
298                                                 # here but will be set below.
299
300         $ST             r9,`0*$BNSZ`(r3)        # r[0]=c1;
301                                                 # sqr_add_c2(a,1,0,c2,c3,c1);
302         $LD             r6,`1*$BNSZ`(r4)                
303         $UMULL          r7,r5,r6
304         $UMULH          r8,r5,r6
305                                         
306         addc            r7,r7,r7                # compute (r7,r8)=2*(r7,r8)
307         adde            r8,r8,r8
308         addze           r9,r0                   # catch the carry, if any:
309                                                 # r9 = r0(=0) + carry
310         
311         addc            r10,r7,r10              # now add to the temp result.
312         addze           r11,r8                  # r11 = r8 + carry (c3 was 0)
313         addze           r9,r9
314         
315         $ST             r10,`1*$BNSZ`(r3)       #r[1]=c2; 
316                                                 #sqr_add_c(a,1,c3,c1,c2)
317         $UMULL          r7,r6,r6
318         $UMULH          r8,r6,r6
319         addc            r11,r7,r11
320         adde            r9,r8,r9
321         addze           r10,r0
322                                                 #sqr_add_c2(a,2,0,c3,c1,c2)
323         $LD             r6,`2*$BNSZ`(r4)
324         $UMULL          r7,r5,r6
325         $UMULH          r8,r5,r6
326         
327         addc            r7,r7,r7
328         adde            r8,r8,r8
329         addze           r10,r10
330         
331         addc            r11,r7,r11
332         adde            r9,r8,r9
333         addze           r10,r10
334         $ST             r11,`2*$BNSZ`(r3)       #r[2]=c3 
335                                                 #sqr_add_c2(a,3,0,c1,c2,c3);
336         $LD             r6,`3*$BNSZ`(r4)                
337         $UMULL          r7,r5,r6
338         $UMULH          r8,r5,r6
339         addc            r7,r7,r7
340         adde            r8,r8,r8
341         addze           r11,r0
342         
343         addc            r9,r7,r9
344         adde            r10,r8,r10
345         addze           r11,r11
346                                                 #sqr_add_c2(a,2,1,c1,c2,c3);
347         $LD             r5,`1*$BNSZ`(r4)
348         $LD             r6,`2*$BNSZ`(r4)
349         $UMULL          r7,r5,r6
350         $UMULH          r8,r5,r6
351         
352         addc            r7,r7,r7
353         adde            r8,r8,r8
354         addze           r11,r11
355         addc            r9,r7,r9
356         adde            r10,r8,r10
357         addze           r11,r11
358         $ST             r9,`3*$BNSZ`(r3)        #r[3]=c1
359                                                 #sqr_add_c(a,2,c2,c3,c1);
360         $UMULL          r7,r6,r6
361         $UMULH          r8,r6,r6
362         addc            r10,r7,r10
363         adde            r11,r8,r11
364         addze           r9,r0
365                                                 #sqr_add_c2(a,3,1,c2,c3,c1);
366         $LD             r6,`3*$BNSZ`(r4)                
367         $UMULL          r7,r5,r6
368         $UMULH          r8,r5,r6
369         addc            r7,r7,r7
370         adde            r8,r8,r8
371         addze           r9,r9
372         
373         addc            r10,r7,r10
374         adde            r11,r8,r11
375         addze           r9,r9
376         $ST             r10,`4*$BNSZ`(r3)       #r[4]=c2
377                                                 #sqr_add_c2(a,3,2,c3,c1,c2);
378         $LD             r5,`2*$BNSZ`(r4)                
379         $UMULL          r7,r5,r6
380         $UMULH          r8,r5,r6
381         addc            r7,r7,r7
382         adde            r8,r8,r8
383         addze           r10,r0
384         
385         addc            r11,r7,r11
386         adde            r9,r8,r9
387         addze           r10,r10
388         $ST             r11,`5*$BNSZ`(r3)       #r[5] = c3
389                                                 #sqr_add_c(a,3,c1,c2,c3);
390         $UMULL          r7,r6,r6                
391         $UMULH          r8,r6,r6
392         addc            r9,r7,r9
393         adde            r10,r8,r10
394
395         $ST             r9,`6*$BNSZ`(r3)        #r[6]=c1
396         $ST             r10,`7*$BNSZ`(r3)       #r[7]=c2
397         blr
398         .long   0
399         .byte   0,12,0x14,0,0,0,2,0
400         .long   0
401 .size   .bn_sqr_comba4,.-.bn_sqr_comba4
402
403 #
404 #       NOTE:   The following label name should be changed to
405 #               "bn_sqr_comba8" i.e. remove the first dot
406 #               for the gcc compiler. This should be automatically
407 #               done in the build
408 #
409         
410 .align  4
411 .bn_sqr_comba8:
412 #
413 # This is an optimized version of the bn_sqr_comba8 routine.
414 # It makes tight use of the adde instruction.
415 #
416 #
417 # void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
418 # r3 contains r
419 # r4 contains a
420 #
421 # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:       
422
423 # r5,r6 are the two BN_ULONGs being multiplied.
424 # r7,r8 are the results of the 32x32 giving 64 bit multiply.
425 # r9,r10, r11 are the equivalents of c1,c2, c3.
426 #
427 # A possible optimization, loading all 8 words of a into registers,
428 # doesn't provide any speedup.
429
430
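# The sqr_add_c/sqr_add_c2 steps below follow the same three-word
# accumulation pattern sketched above .bn_sqr_comba4, just unrolled
# for eight words instead of four.
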
431         xor             r0,r0,r0                #set r0 = 0. Used in addze
432                                                 #instructions below.
433
434                                                 #sqr_add_c(a,0,c1,c2,c3);
435         $LD             r5,`0*$BNSZ`(r4)
436         $UMULL          r9,r5,r5                #1st iteration: no carries.
437         $UMULH          r10,r5,r5
438         $ST             r9,`0*$BNSZ`(r3)        # r[0]=c1;
439                                                 #sqr_add_c2(a,1,0,c2,c3,c1);
440         $LD             r6,`1*$BNSZ`(r4)
441         $UMULL          r7,r5,r6
442         $UMULH          r8,r5,r6        
443         
444         addc            r10,r7,r10              #add the two-register number
445         adde            r11,r8,r0               # (r8,r7) to the three-register
446         addze           r9,r0                   # number (r9,r11,r10). NOTE: r0=0
447         
448         addc            r10,r7,r10              #add the two-register number
449         adde            r11,r8,r11              # (r8,r7) to the three-register
450         addze           r9,r9                   # number (r9,r11,r10).
451         
452         $ST             r10,`1*$BNSZ`(r3)       # r[1]=c2
453                                 
454                                                 #sqr_add_c(a,1,c3,c1,c2);
455         $UMULL          r7,r6,r6
456         $UMULH          r8,r6,r6
457         addc            r11,r7,r11
458         adde            r9,r8,r9
459         addze           r10,r0
460                                                 #sqr_add_c2(a,2,0,c3,c1,c2);
461         $LD             r6,`2*$BNSZ`(r4)
462         $UMULL          r7,r5,r6
463         $UMULH          r8,r5,r6
464         
465         addc            r11,r7,r11
466         adde            r9,r8,r9
467         addze           r10,r10
468         
469         addc            r11,r7,r11
470         adde            r9,r8,r9
471         addze           r10,r10
472         
473         $ST             r11,`2*$BNSZ`(r3)       #r[2]=c3
474                                                 #sqr_add_c2(a,3,0,c1,c2,c3);
475         $LD             r6,`3*$BNSZ`(r4)        #r6 = a[3]. r5 is already a[0].
476         $UMULL          r7,r5,r6
477         $UMULH          r8,r5,r6
478         
479         addc            r9,r7,r9
480         adde            r10,r8,r10
481         addze           r11,r0
482         
483         addc            r9,r7,r9
484         adde            r10,r8,r10
485         addze           r11,r11
486                                                 #sqr_add_c2(a,2,1,c1,c2,c3);
487         $LD             r5,`1*$BNSZ`(r4)
488         $LD             r6,`2*$BNSZ`(r4)
489         $UMULL          r7,r5,r6
490         $UMULH          r8,r5,r6
491         
492         addc            r9,r7,r9
493         adde            r10,r8,r10
494         addze           r11,r11
495         
496         addc            r9,r7,r9
497         adde            r10,r8,r10
498         addze           r11,r11
499         
500         $ST             r9,`3*$BNSZ`(r3)        #r[3]=c1;
501                                                 #sqr_add_c(a,2,c2,c3,c1);
502         $UMULL          r7,r6,r6
503         $UMULH          r8,r6,r6
504         
505         addc            r10,r7,r10
506         adde            r11,r8,r11
507         addze           r9,r0
508                                                 #sqr_add_c2(a,3,1,c2,c3,c1);
509         $LD             r6,`3*$BNSZ`(r4)
510         $UMULL          r7,r5,r6
511         $UMULH          r8,r5,r6
512         
513         addc            r10,r7,r10
514         adde            r11,r8,r11
515         addze           r9,r9
516         
517         addc            r10,r7,r10
518         adde            r11,r8,r11
519         addze           r9,r9
520                                                 #sqr_add_c2(a,4,0,c2,c3,c1);
521         $LD             r5,`0*$BNSZ`(r4)
522         $LD             r6,`4*$BNSZ`(r4)
523         $UMULL          r7,r5,r6
524         $UMULH          r8,r5,r6
525         
526         addc            r10,r7,r10
527         adde            r11,r8,r11
528         addze           r9,r9
529         
530         addc            r10,r7,r10
531         adde            r11,r8,r11
532         addze           r9,r9
533         $ST             r10,`4*$BNSZ`(r3)       #r[4]=c2;
534                                                 #sqr_add_c2(a,5,0,c3,c1,c2);
535         $LD             r6,`5*$BNSZ`(r4)
536         $UMULL          r7,r5,r6
537         $UMULH          r8,r5,r6
538         
539         addc            r11,r7,r11
540         adde            r9,r8,r9
541         addze           r10,r0
542         
543         addc            r11,r7,r11
544         adde            r9,r8,r9
545         addze           r10,r10
546                                                 #sqr_add_c2(a,4,1,c3,c1,c2);
547         $LD             r5,`1*$BNSZ`(r4)
548         $LD             r6,`4*$BNSZ`(r4)
549         $UMULL          r7,r5,r6
550         $UMULH          r8,r5,r6
551         
552         addc            r11,r7,r11
553         adde            r9,r8,r9
554         addze           r10,r10
555         
556         addc            r11,r7,r11
557         adde            r9,r8,r9
558         addze           r10,r10
559                                                 #sqr_add_c2(a,3,2,c3,c1,c2);
560         $LD             r5,`2*$BNSZ`(r4)
561         $LD             r6,`3*$BNSZ`(r4)
562         $UMULL          r7,r5,r6
563         $UMULH          r8,r5,r6
564         
565         addc            r11,r7,r11
566         adde            r9,r8,r9
567         addze           r10,r10
568         
569         addc            r11,r7,r11
570         adde            r9,r8,r9
571         addze           r10,r10
572         $ST             r11,`5*$BNSZ`(r3)       #r[5]=c3;
573                                                 #sqr_add_c(a,3,c1,c2,c3);
574         $UMULL          r7,r6,r6
575         $UMULH          r8,r6,r6
576         addc            r9,r7,r9
577         adde            r10,r8,r10
578         addze           r11,r0
579                                                 #sqr_add_c2(a,4,2,c1,c2,c3);
580         $LD             r6,`4*$BNSZ`(r4)
581         $UMULL          r7,r5,r6
582         $UMULH          r8,r5,r6
583         
584         addc            r9,r7,r9
585         adde            r10,r8,r10
586         addze           r11,r11
587         
588         addc            r9,r7,r9
589         adde            r10,r8,r10
590         addze           r11,r11
591                                                 #sqr_add_c2(a,5,1,c1,c2,c3);
592         $LD             r5,`1*$BNSZ`(r4)
593         $LD             r6,`5*$BNSZ`(r4)
594         $UMULL          r7,r5,r6
595         $UMULH          r8,r5,r6
596         
597         addc            r9,r7,r9
598         adde            r10,r8,r10
599         addze           r11,r11
600         
601         addc            r9,r7,r9
602         adde            r10,r8,r10
603         addze           r11,r11
604                                                 #sqr_add_c2(a,6,0,c1,c2,c3);
605         $LD             r5,`0*$BNSZ`(r4)
606         $LD             r6,`6*$BNSZ`(r4)
607         $UMULL          r7,r5,r6
608         $UMULH          r8,r5,r6
609         addc            r9,r7,r9
610         adde            r10,r8,r10
611         addze           r11,r11
612         addc            r9,r7,r9
613         adde            r10,r8,r10
614         addze           r11,r11
615         $ST             r9,`6*$BNSZ`(r3)        #r[6]=c1;
616                                                 #sqr_add_c2(a,7,0,c2,c3,c1);
617         $LD             r6,`7*$BNSZ`(r4)
618         $UMULL          r7,r5,r6
619         $UMULH          r8,r5,r6
620         
621         addc            r10,r7,r10
622         adde            r11,r8,r11
623         addze           r9,r0
624         addc            r10,r7,r10
625         adde            r11,r8,r11
626         addze           r9,r9
627                                                 #sqr_add_c2(a,6,1,c2,c3,c1);
628         $LD             r5,`1*$BNSZ`(r4)
629         $LD             r6,`6*$BNSZ`(r4)
630         $UMULL          r7,r5,r6
631         $UMULH          r8,r5,r6
632         
633         addc            r10,r7,r10
634         adde            r11,r8,r11
635         addze           r9,r9
636         addc            r10,r7,r10
637         adde            r11,r8,r11
638         addze           r9,r9
639                                                 #sqr_add_c2(a,5,2,c2,c3,c1);
640         $LD             r5,`2*$BNSZ`(r4)
641         $LD             r6,`5*$BNSZ`(r4)
642         $UMULL          r7,r5,r6
643         $UMULH          r8,r5,r6
644         addc            r10,r7,r10
645         adde            r11,r8,r11
646         addze           r9,r9
647         addc            r10,r7,r10
648         adde            r11,r8,r11
649         addze           r9,r9
650                                                 #sqr_add_c2(a,4,3,c2,c3,c1);
651         $LD             r5,`3*$BNSZ`(r4)
652         $LD             r6,`4*$BNSZ`(r4)
653         $UMULL          r7,r5,r6
654         $UMULH          r8,r5,r6
655         
656         addc            r10,r7,r10
657         adde            r11,r8,r11
658         addze           r9,r9
659         addc            r10,r7,r10
660         adde            r11,r8,r11
661         addze           r9,r9
662         $ST             r10,`7*$BNSZ`(r3)       #r[7]=c2;
663                                                 #sqr_add_c(a,4,c3,c1,c2);
664         $UMULL          r7,r6,r6
665         $UMULH          r8,r6,r6
666         addc            r11,r7,r11
667         adde            r9,r8,r9
668         addze           r10,r0
669                                                 #sqr_add_c2(a,5,3,c3,c1,c2);
670         $LD             r6,`5*$BNSZ`(r4)
671         $UMULL          r7,r5,r6
672         $UMULH          r8,r5,r6
673         addc            r11,r7,r11
674         adde            r9,r8,r9
675         addze           r10,r10
676         addc            r11,r7,r11
677         adde            r9,r8,r9
678         addze           r10,r10
679                                                 #sqr_add_c2(a,6,2,c3,c1,c2);
680         $LD             r5,`2*$BNSZ`(r4)
681         $LD             r6,`6*$BNSZ`(r4)
682         $UMULL          r7,r5,r6
683         $UMULH          r8,r5,r6
684         addc            r11,r7,r11
685         adde            r9,r8,r9
686         addze           r10,r10
687         
688         addc            r11,r7,r11
689         adde            r9,r8,r9
690         addze           r10,r10
691                                                 #sqr_add_c2(a,7,1,c3,c1,c2);
692         $LD             r5,`1*$BNSZ`(r4)
693         $LD             r6,`7*$BNSZ`(r4)
694         $UMULL          r7,r5,r6
695         $UMULH          r8,r5,r6
696         addc            r11,r7,r11
697         adde            r9,r8,r9
698         addze           r10,r10
699         addc            r11,r7,r11
700         adde            r9,r8,r9
701         addze           r10,r10
702         $ST             r11,`8*$BNSZ`(r3)       #r[8]=c3;
703                                                 #sqr_add_c2(a,7,2,c1,c2,c3);
704         $LD             r5,`2*$BNSZ`(r4)
705         $UMULL          r7,r5,r6
706         $UMULH          r8,r5,r6
707         
708         addc            r9,r7,r9
709         adde            r10,r8,r10
710         addze           r11,r0
711         addc            r9,r7,r9
712         adde            r10,r8,r10
713         addze           r11,r11
714                                                 #sqr_add_c2(a,6,3,c1,c2,c3);
715         $LD             r5,`3*$BNSZ`(r4)
716         $LD             r6,`6*$BNSZ`(r4)
717         $UMULL          r7,r5,r6
718         $UMULH          r8,r5,r6
719         addc            r9,r7,r9
720         adde            r10,r8,r10
721         addze           r11,r11
722         addc            r9,r7,r9
723         adde            r10,r8,r10
724         addze           r11,r11
725                                                 #sqr_add_c2(a,5,4,c1,c2,c3);
726         $LD             r5,`4*$BNSZ`(r4)
727         $LD             r6,`5*$BNSZ`(r4)
728         $UMULL          r7,r5,r6
729         $UMULH          r8,r5,r6
730         addc            r9,r7,r9
731         adde            r10,r8,r10
732         addze           r11,r11
733         addc            r9,r7,r9
734         adde            r10,r8,r10
735         addze           r11,r11
736         $ST             r9,`9*$BNSZ`(r3)        #r[9]=c1;
737                                                 #sqr_add_c(a,5,c2,c3,c1);
738         $UMULL          r7,r6,r6
739         $UMULH          r8,r6,r6
740         addc            r10,r7,r10
741         adde            r11,r8,r11
742         addze           r9,r0
743                                                 #sqr_add_c2(a,6,4,c2,c3,c1);
744         $LD             r6,`6*$BNSZ`(r4)
745         $UMULL          r7,r5,r6
746         $UMULH          r8,r5,r6
747         addc            r10,r7,r10
748         adde            r11,r8,r11
749         addze           r9,r9
750         addc            r10,r7,r10
751         adde            r11,r8,r11
752         addze           r9,r9
753                                                 #sqr_add_c2(a,7,3,c2,c3,c1);
754         $LD             r5,`3*$BNSZ`(r4)
755         $LD             r6,`7*$BNSZ`(r4)
756         $UMULL          r7,r5,r6
757         $UMULH          r8,r5,r6
758         addc            r10,r7,r10
759         adde            r11,r8,r11
760         addze           r9,r9
761         addc            r10,r7,r10
762         adde            r11,r8,r11
763         addze           r9,r9
764         $ST             r10,`10*$BNSZ`(r3)      #r[10]=c2;
765                                                 #sqr_add_c2(a,7,4,c3,c1,c2);
766         $LD             r5,`4*$BNSZ`(r4)
767         $UMULL          r7,r5,r6
768         $UMULH          r8,r5,r6
769         addc            r11,r7,r11
770         adde            r9,r8,r9
771         addze           r10,r0
772         addc            r11,r7,r11
773         adde            r9,r8,r9
774         addze           r10,r10
775                                                 #sqr_add_c2(a,6,5,c3,c1,c2);
776         $LD             r5,`5*$BNSZ`(r4)
777         $LD             r6,`6*$BNSZ`(r4)
778         $UMULL          r7,r5,r6
779         $UMULH          r8,r5,r6
780         addc            r11,r7,r11
781         adde            r9,r8,r9
782         addze           r10,r10
783         addc            r11,r7,r11
784         adde            r9,r8,r9
785         addze           r10,r10
786         $ST             r11,`11*$BNSZ`(r3)      #r[11]=c3;
787                                                 #sqr_add_c(a,6,c1,c2,c3);
788         $UMULL          r7,r6,r6
789         $UMULH          r8,r6,r6
790         addc            r9,r7,r9
791         adde            r10,r8,r10
792         addze           r11,r0
793                                                 #sqr_add_c2(a,7,5,c1,c2,c3)
794         $LD             r6,`7*$BNSZ`(r4)
795         $UMULL          r7,r5,r6
796         $UMULH          r8,r5,r6
797         addc            r9,r7,r9
798         adde            r10,r8,r10
799         addze           r11,r11
800         addc            r9,r7,r9
801         adde            r10,r8,r10
802         addze           r11,r11
803         $ST             r9,`12*$BNSZ`(r3)       #r[12]=c1;
804         
805                                                 #sqr_add_c2(a,7,6,c2,c3,c1)
806         $LD             r5,`6*$BNSZ`(r4)
807         $UMULL          r7,r5,r6
808         $UMULH          r8,r5,r6
809         addc            r10,r7,r10
810         adde            r11,r8,r11
811         addze           r9,r0
812         addc            r10,r7,r10
813         adde            r11,r8,r11
814         addze           r9,r9
815         $ST             r10,`13*$BNSZ`(r3)      #r[13]=c2;
816                                                 #sqr_add_c(a,7,c3,c1,c2);
817         $UMULL          r7,r6,r6
818         $UMULH          r8,r6,r6
819         addc            r11,r7,r11
820         adde            r9,r8,r9
821         $ST             r11,`14*$BNSZ`(r3)      #r[14]=c3;
822         $ST             r9, `15*$BNSZ`(r3)      #r[15]=c1;
823
824
825         blr
826         .long   0
827         .byte   0,12,0x14,0,0,0,2,0
828         .long   0
829 .size   .bn_sqr_comba8,.-.bn_sqr_comba8
830
831 #
832 #       NOTE:   The following label name should be changed to
833 #               "bn_mul_comba4" i.e. remove the first dot
834 #               for the gcc compiler. This should be automatically
835 #               done in the build
836 #
837
838 .align  4
839 .bn_mul_comba4:
840 #
841 # This is an optimized version of the bn_mul_comba4 routine.
842 #
843 # void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
844 # r3 contains r
845 # r4 contains a
846 # r5 contains b
847 # r6, r7 are the 2 BN_ULONGs being multiplied.
848 # r8, r9 are the results of the 32x32 giving 64 bit multiply.
849 # r10, r11, r12 are the equivalents of c1, c2, and c3.
850 #
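# For reference, each mul_add_c(a,b,c0,c1,c2) step below is a sketch-level
# equivalent of the generic comba macro in crypto/bn/bn_asm.c, i.e.
# conceptually:
#
#       (hi,lo) = a * b
#       c0 += lo; c1 += hi + carry; c2 += carry
#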
851         xor     r0,r0,r0                #r0=0. Used in addze below.
852                                         #mul_add_c(a[0],b[0],c1,c2,c3);
853         $LD     r6,`0*$BNSZ`(r4)                
854         $LD     r7,`0*$BNSZ`(r5)                
855         $UMULL  r10,r6,r7               
856         $UMULH  r11,r6,r7               
857         $ST     r10,`0*$BNSZ`(r3)       #r[0]=c1
858                                         #mul_add_c(a[0],b[1],c2,c3,c1);
859         $LD     r7,`1*$BNSZ`(r5)                
860         $UMULL  r8,r6,r7
861         $UMULH  r9,r6,r7
862         addc    r11,r8,r11
863         adde    r12,r9,r0
864         addze   r10,r0
865                                         #mul_add_c(a[1],b[0],c2,c3,c1);
866         $LD     r6, `1*$BNSZ`(r4)               
867         $LD     r7, `0*$BNSZ`(r5)               
868         $UMULL  r8,r6,r7
869         $UMULH  r9,r6,r7
870         addc    r11,r8,r11
871         adde    r12,r9,r12
872         addze   r10,r10
873         $ST     r11,`1*$BNSZ`(r3)       #r[1]=c2
874                                         #mul_add_c(a[2],b[0],c3,c1,c2);
875         $LD     r6,`2*$BNSZ`(r4)                
876         $UMULL  r8,r6,r7
877         $UMULH  r9,r6,r7
878         addc    r12,r8,r12
879         adde    r10,r9,r10
880         addze   r11,r0
881                                         #mul_add_c(a[1],b[1],c3,c1,c2);
882         $LD     r6,`1*$BNSZ`(r4)                
883         $LD     r7,`1*$BNSZ`(r5)                
884         $UMULL  r8,r6,r7
885         $UMULH  r9,r6,r7
886         addc    r12,r8,r12
887         adde    r10,r9,r10
888         addze   r11,r11
889                                         #mul_add_c(a[0],b[2],c3,c1,c2);
890         $LD     r6,`0*$BNSZ`(r4)                
891         $LD     r7,`2*$BNSZ`(r5)                
892         $UMULL  r8,r6,r7
893         $UMULH  r9,r6,r7
894         addc    r12,r8,r12
895         adde    r10,r9,r10
896         addze   r11,r11
897         $ST     r12,`2*$BNSZ`(r3)       #r[2]=c3
898                                         #mul_add_c(a[0],b[3],c1,c2,c3);
899         $LD     r7,`3*$BNSZ`(r5)                
900         $UMULL  r8,r6,r7
901         $UMULH  r9,r6,r7
902         addc    r10,r8,r10
903         adde    r11,r9,r11
904         addze   r12,r0
905                                         #mul_add_c(a[1],b[2],c1,c2,c3);
906         $LD     r6,`1*$BNSZ`(r4)
907         $LD     r7,`2*$BNSZ`(r5)
908         $UMULL  r8,r6,r7
909         $UMULH  r9,r6,r7
910         addc    r10,r8,r10
911         adde    r11,r9,r11
912         addze   r12,r12
913                                         #mul_add_c(a[2],b[1],c1,c2,c3);
914         $LD     r6,`2*$BNSZ`(r4)
915         $LD     r7,`1*$BNSZ`(r5)
916         $UMULL  r8,r6,r7
917         $UMULH  r9,r6,r7
918         addc    r10,r8,r10
919         adde    r11,r9,r11
920         addze   r12,r12
921                                         #mul_add_c(a[3],b[0],c1,c2,c3);
922         $LD     r6,`3*$BNSZ`(r4)
923         $LD     r7,`0*$BNSZ`(r5)
924         $UMULL  r8,r6,r7
925         $UMULH  r9,r6,r7
926         addc    r10,r8,r10
927         adde    r11,r9,r11
928         addze   r12,r12
929         $ST     r10,`3*$BNSZ`(r3)       #r[3]=c1
930                                         #mul_add_c(a[3],b[1],c2,c3,c1);
931         $LD     r7,`1*$BNSZ`(r5)                
932         $UMULL  r8,r6,r7
933         $UMULH  r9,r6,r7
934         addc    r11,r8,r11
935         adde    r12,r9,r12
936         addze   r10,r0
937                                         #mul_add_c(a[2],b[2],c2,c3,c1);
938         $LD     r6,`2*$BNSZ`(r4)
939         $LD     r7,`2*$BNSZ`(r5)
940         $UMULL  r8,r6,r7
941         $UMULH  r9,r6,r7
942         addc    r11,r8,r11
943         adde    r12,r9,r12
944         addze   r10,r10
945                                         #mul_add_c(a[1],b[3],c2,c3,c1);
946         $LD     r6,`1*$BNSZ`(r4)
947         $LD     r7,`3*$BNSZ`(r5)
948         $UMULL  r8,r6,r7
949         $UMULH  r9,r6,r7
950         addc    r11,r8,r11
951         adde    r12,r9,r12
952         addze   r10,r10
953         $ST     r11,`4*$BNSZ`(r3)       #r[4]=c2
954                                         #mul_add_c(a[2],b[3],c3,c1,c2);
955         $LD     r6,`2*$BNSZ`(r4)                
956         $UMULL  r8,r6,r7
957         $UMULH  r9,r6,r7
958         addc    r12,r8,r12
959         adde    r10,r9,r10
960         addze   r11,r0
961                                         #mul_add_c(a[3],b[2],c3,c1,c2);
962         $LD     r6,`3*$BNSZ`(r4)
963         $LD     r7,`2*$BNSZ`(r5)
964         $UMULL  r8,r6,r7
965         $UMULH  r9,r6,r7
966         addc    r12,r8,r12
967         adde    r10,r9,r10
968         addze   r11,r11
969         $ST     r12,`5*$BNSZ`(r3)       #r[5]=c3
970                                         #mul_add_c(a[3],b[3],c1,c2,c3);
971         $LD     r7,`3*$BNSZ`(r5)                
972         $UMULL  r8,r6,r7
973         $UMULH  r9,r6,r7
974         addc    r10,r8,r10
975         adde    r11,r9,r11
976
977         $ST     r10,`6*$BNSZ`(r3)       #r[6]=c1
978         $ST     r11,`7*$BNSZ`(r3)       #r[7]=c2
979         blr
980         .long   0
981         .byte   0,12,0x14,0,0,0,3,0
982         .long   0
983 .size   .bn_mul_comba4,.-.bn_mul_comba4
984
985 #
986 #       NOTE:   The following label name should be changed to
987 #               "bn_mul_comba8" i.e. remove the first dot
988 #               for the gcc compiler. This should be automatically
989 #               done in the build
990 #
991         
992 .align  4
993 .bn_mul_comba8:
994 #
995 # Optimized version of the bn_mul_comba8 routine.
996 #
997 # void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
998 # r3 contains r
999 # r4 contains a
1000 # r5 contains b
1001 # r6, r7 are the 2 BN_ULONGs being multiplied.
1002 # r8, r9 are the results of the 32x32 giving 64 bit multiply.
1003 # r10, r11, r12 are the equivalents of c1, c2, and c3.
1004 #
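# The mul_add_c steps below follow the same three-word accumulation
# pattern sketched above .bn_mul_comba4, unrolled for eight words.
#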
1005         xor     r0,r0,r0                #r0=0. Used in addze below.
1006         
1007                                         #mul_add_c(a[0],b[0],c1,c2,c3);
1008         $LD     r6,`0*$BNSZ`(r4)        #a[0]
1009         $LD     r7,`0*$BNSZ`(r5)        #b[0]
1010         $UMULL  r10,r6,r7
1011         $UMULH  r11,r6,r7
1012         $ST     r10,`0*$BNSZ`(r3)       #r[0]=c1;
1013                                         #mul_add_c(a[0],b[1],c2,c3,c1);
1014         $LD     r7,`1*$BNSZ`(r5)
1015         $UMULL  r8,r6,r7
1016         $UMULH  r9,r6,r7
1017         addc    r11,r11,r8
1018         addze   r12,r9                  # since we didn't set r12 to zero before.
1019         addze   r10,r0
1020                                         #mul_add_c(a[1],b[0],c2,c3,c1);
1021         $LD     r6,`1*$BNSZ`(r4)
1022         $LD     r7,`0*$BNSZ`(r5)
1023         $UMULL  r8,r6,r7
1024         $UMULH  r9,r6,r7
1025         addc    r11,r11,r8
1026         adde    r12,r12,r9
1027         addze   r10,r10
1028         $ST     r11,`1*$BNSZ`(r3)       #r[1]=c2;
1029                                         #mul_add_c(a[2],b[0],c3,c1,c2);
1030         $LD     r6,`2*$BNSZ`(r4)
1031         $UMULL  r8,r6,r7
1032         $UMULH  r9,r6,r7
1033         addc    r12,r12,r8
1034         adde    r10,r10,r9
1035         addze   r11,r0
1036                                         #mul_add_c(a[1],b[1],c3,c1,c2);
1037         $LD     r6,`1*$BNSZ`(r4)
1038         $LD     r7,`1*$BNSZ`(r5)
1039         $UMULL  r8,r6,r7
1040         $UMULH  r9,r6,r7
1041         addc    r12,r12,r8
1042         adde    r10,r10,r9
1043         addze   r11,r11
1044                                         #mul_add_c(a[0],b[2],c3,c1,c2);
1045         $LD     r6,`0*$BNSZ`(r4)
1046         $LD     r7,`2*$BNSZ`(r5)
1047         $UMULL  r8,r6,r7
1048         $UMULH  r9,r6,r7
1049         addc    r12,r12,r8
1050         adde    r10,r10,r9
1051         addze   r11,r11
1052         $ST     r12,`2*$BNSZ`(r3)       #r[2]=c3;
1053                                         #mul_add_c(a[0],b[3],c1,c2,c3);
1054         $LD     r7,`3*$BNSZ`(r5)
1055         $UMULL  r8,r6,r7
1056         $UMULH  r9,r6,r7
1057         addc    r10,r10,r8
1058         adde    r11,r11,r9
1059         addze   r12,r0
1060                                         #mul_add_c(a[1],b[2],c1,c2,c3);
1061         $LD     r6,`1*$BNSZ`(r4)
1062         $LD     r7,`2*$BNSZ`(r5)
1063         $UMULL  r8,r6,r7
1064         $UMULH  r9,r6,r7
1065         addc    r10,r10,r8
1066         adde    r11,r11,r9
1067         addze   r12,r12
1068                 
1069                                         #mul_add_c(a[2],b[1],c1,c2,c3);
1070         $LD     r6,`2*$BNSZ`(r4)
1071         $LD     r7,`1*$BNSZ`(r5)
1072         $UMULL  r8,r6,r7
1073         $UMULH  r9,r6,r7
1074         addc    r10,r10,r8
1075         adde    r11,r11,r9
1076         addze   r12,r12
1077                                         #mul_add_c(a[3],b[0],c1,c2,c3);
1078         $LD     r6,`3*$BNSZ`(r4)
1079         $LD     r7,`0*$BNSZ`(r5)
1080         $UMULL  r8,r6,r7
1081         $UMULH  r9,r6,r7
1082         addc    r10,r10,r8
1083         adde    r11,r11,r9
1084         addze   r12,r12
1085         $ST     r10,`3*$BNSZ`(r3)       #r[3]=c1;
1086                                         #mul_add_c(a[4],b[0],c2,c3,c1);
1087         $LD     r6,`4*$BNSZ`(r4)
1088         $UMULL  r8,r6,r7
1089         $UMULH  r9,r6,r7
1090         addc    r11,r11,r8
1091         adde    r12,r12,r9
1092         addze   r10,r0
1093                                         #mul_add_c(a[3],b[1],c2,c3,c1);
1094         $LD     r6,`3*$BNSZ`(r4)
1095         $LD     r7,`1*$BNSZ`(r5)
1096         $UMULL  r8,r6,r7
1097         $UMULH  r9,r6,r7
1098         addc    r11,r11,r8
1099         adde    r12,r12,r9
1100         addze   r10,r10
1101                                         #mul_add_c(a[2],b[2],c2,c3,c1);
1102         $LD     r6,`2*$BNSZ`(r4)
1103         $LD     r7,`2*$BNSZ`(r5)
1104         $UMULL  r8,r6,r7
1105         $UMULH  r9,r6,r7
1106         addc    r11,r11,r8
1107         adde    r12,r12,r9
1108         addze   r10,r10
1109                                         #mul_add_c(a[1],b[3],c2,c3,c1);
1110         $LD     r6,`1*$BNSZ`(r4)
1111         $LD     r7,`3*$BNSZ`(r5)
1112         $UMULL  r8,r6,r7
1113         $UMULH  r9,r6,r7
1114         addc    r11,r11,r8
1115         adde    r12,r12,r9
1116         addze   r10,r10
1117                                         #mul_add_c(a[0],b[4],c2,c3,c1);
1118         $LD     r6,`0*$BNSZ`(r4)
1119         $LD     r7,`4*$BNSZ`(r5)
1120         $UMULL  r8,r6,r7
1121         $UMULH  r9,r6,r7
1122         addc    r11,r11,r8
1123         adde    r12,r12,r9
1124         addze   r10,r10
1125         $ST     r11,`4*$BNSZ`(r3)       #r[4]=c2;
1126                                         #mul_add_c(a[0],b[5],c3,c1,c2);
1127         $LD     r7,`5*$BNSZ`(r5)
1128         $UMULL  r8,r6,r7
1129         $UMULH  r9,r6,r7
1130         addc    r12,r12,r8
1131         adde    r10,r10,r9
1132         addze   r11,r0
1133                                         #mul_add_c(a[1],b[4],c3,c1,c2);
1134         $LD     r6,`1*$BNSZ`(r4)                
1135         $LD     r7,`4*$BNSZ`(r5)
1136         $UMULL  r8,r6,r7
1137         $UMULH  r9,r6,r7
1138         addc    r12,r12,r8
1139         adde    r10,r10,r9
1140         addze   r11,r11
1141                                         #mul_add_c(a[2],b[3],c3,c1,c2);
1142         $LD     r6,`2*$BNSZ`(r4)                
1143         $LD     r7,`3*$BNSZ`(r5)
1144         $UMULL  r8,r6,r7
1145         $UMULH  r9,r6,r7
1146         addc    r12,r12,r8
1147         adde    r10,r10,r9
1148         addze   r11,r11
1149                                         #mul_add_c(a[3],b[2],c3,c1,c2);
1150         $LD     r6,`3*$BNSZ`(r4)                
1151         $LD     r7,`2*$BNSZ`(r5)
1152         $UMULL  r8,r6,r7
1153         $UMULH  r9,r6,r7
1154         addc    r12,r12,r8
1155         adde    r10,r10,r9
1156         addze   r11,r11
1157                                         #mul_add_c(a[4],b[1],c3,c1,c2);
1158         $LD     r6,`4*$BNSZ`(r4)                
1159         $LD     r7,`1*$BNSZ`(r5)
1160         $UMULL  r8,r6,r7
1161         $UMULH  r9,r6,r7
1162         addc    r12,r12,r8
1163         adde    r10,r10,r9
1164         addze   r11,r11
1165                                         #mul_add_c(a[5],b[0],c3,c1,c2);
1166         $LD     r6,`5*$BNSZ`(r4)                
1167         $LD     r7,`0*$BNSZ`(r5)
1168         $UMULL  r8,r6,r7
1169         $UMULH  r9,r6,r7
1170         addc    r12,r12,r8
1171         adde    r10,r10,r9
1172         addze   r11,r11
1173         $ST     r12,`5*$BNSZ`(r3)       #r[5]=c3;
1174                                         #mul_add_c(a[6],b[0],c1,c2,c3);
1175         $LD     r6,`6*$BNSZ`(r4)
1176         $UMULL  r8,r6,r7
1177         $UMULH  r9,r6,r7
1178         addc    r10,r10,r8
1179         adde    r11,r11,r9
1180         addze   r12,r0
1181                                         #mul_add_c(a[5],b[1],c1,c2,c3);
1182         $LD     r6,`5*$BNSZ`(r4)
1183         $LD     r7,`1*$BNSZ`(r5)
1184         $UMULL  r8,r6,r7
1185         $UMULH  r9,r6,r7
1186         addc    r10,r10,r8
1187         adde    r11,r11,r9
1188         addze   r12,r12
1189                                         #mul_add_c(a[4],b[2],c1,c2,c3);
1190         $LD     r6,`4*$BNSZ`(r4)
1191         $LD     r7,`2*$BNSZ`(r5)
1192         $UMULL  r8,r6,r7
1193         $UMULH  r9,r6,r7
1194         addc    r10,r10,r8
1195         adde    r11,r11,r9
1196         addze   r12,r12
1197                                         #mul_add_c(a[3],b[3],c1,c2,c3);
1198         $LD     r6,`3*$BNSZ`(r4)
1199         $LD     r7,`3*$BNSZ`(r5)
1200         $UMULL  r8,r6,r7
1201         $UMULH  r9,r6,r7
1202         addc    r10,r10,r8
1203         adde    r11,r11,r9
1204         addze   r12,r12
1205                                         #mul_add_c(a[2],b[4],c1,c2,c3);
1206         $LD     r6,`2*$BNSZ`(r4)
1207         $LD     r7,`4*$BNSZ`(r5)
1208         $UMULL  r8,r6,r7
1209         $UMULH  r9,r6,r7
1210         addc    r10,r10,r8
1211         adde    r11,r11,r9
1212         addze   r12,r12
1213                                         #mul_add_c(a[1],b[5],c1,c2,c3);
1214         $LD     r6,`1*$BNSZ`(r4)
1215         $LD     r7,`5*$BNSZ`(r5)
1216         $UMULL  r8,r6,r7
1217         $UMULH  r9,r6,r7
1218         addc    r10,r10,r8
1219         adde    r11,r11,r9
1220         addze   r12,r12
1221                                         #mul_add_c(a[0],b[6],c1,c2,c3);
1222         $LD     r6,`0*$BNSZ`(r4)
1223         $LD     r7,`6*$BNSZ`(r5)
1224         $UMULL  r8,r6,r7
1225         $UMULH  r9,r6,r7
1226         addc    r10,r10,r8
1227         adde    r11,r11,r9
1228         addze   r12,r12
1229         $ST     r10,`6*$BNSZ`(r3)       #r[6]=c1;
1230                                         #mul_add_c(a[0],b[7],c2,c3,c1);
1231         $LD     r7,`7*$BNSZ`(r5)
1232         $UMULL  r8,r6,r7
1233         $UMULH  r9,r6,r7
1234         addc    r11,r11,r8
1235         adde    r12,r12,r9
1236         addze   r10,r0
1237                                         #mul_add_c(a[1],b[6],c2,c3,c1);
1238         $LD     r6,`1*$BNSZ`(r4)
1239         $LD     r7,`6*$BNSZ`(r5)
1240         $UMULL  r8,r6,r7
1241         $UMULH  r9,r6,r7
1242         addc    r11,r11,r8
1243         adde    r12,r12,r9
1244         addze   r10,r10
1245                                         #mul_add_c(a[2],b[5],c2,c3,c1);
1246         $LD     r6,`2*$BNSZ`(r4)
1247         $LD     r7,`5*$BNSZ`(r5)
1248         $UMULL  r8,r6,r7
1249         $UMULH  r9,r6,r7
1250         addc    r11,r11,r8
1251         adde    r12,r12,r9
1252         addze   r10,r10
1253                                         #mul_add_c(a[3],b[4],c2,c3,c1);
1254         $LD     r6,`3*$BNSZ`(r4)
1255         $LD     r7,`4*$BNSZ`(r5)
1256         $UMULL  r8,r6,r7
1257         $UMULH  r9,r6,r7
1258         addc    r11,r11,r8
1259         adde    r12,r12,r9
1260         addze   r10,r10
1261                                         #mul_add_c(a[4],b[3],c2,c3,c1);
1262         $LD     r6,`4*$BNSZ`(r4)
1263         $LD     r7,`3*$BNSZ`(r5)
1264         $UMULL  r8,r6,r7
1265         $UMULH  r9,r6,r7
1266         addc    r11,r11,r8
1267         adde    r12,r12,r9
1268         addze   r10,r10
1269                                         #mul_add_c(a[5],b[2],c2,c3,c1);
1270         $LD     r6,`5*$BNSZ`(r4)
1271         $LD     r7,`2*$BNSZ`(r5)
1272         $UMULL  r8,r6,r7
1273         $UMULH  r9,r6,r7
1274         addc    r11,r11,r8
1275         adde    r12,r12,r9
1276         addze   r10,r10
1277                                         #mul_add_c(a[6],b[1],c2,c3,c1);
1278         $LD     r6,`6*$BNSZ`(r4)
1279         $LD     r7,`1*$BNSZ`(r5)
1280         $UMULL  r8,r6,r7
1281         $UMULH  r9,r6,r7
1282         addc    r11,r11,r8
1283         adde    r12,r12,r9
1284         addze   r10,r10
1285                                         #mul_add_c(a[7],b[0],c2,c3,c1);
1286         $LD     r6,`7*$BNSZ`(r4)
1287         $LD     r7,`0*$BNSZ`(r5)
1288         $UMULL  r8,r6,r7
1289         $UMULH  r9,r6,r7
1290         addc    r11,r11,r8
1291         adde    r12,r12,r9
1292         addze   r10,r10
1293         $ST     r11,`7*$BNSZ`(r3)       #r[7]=c2;
1294                                         #mul_add_c(a[7],b[1],c3,c1,c2);
1295         $LD     r7,`1*$BNSZ`(r5)
1296         $UMULL  r8,r6,r7
1297         $UMULH  r9,r6,r7
1298         addc    r12,r12,r8
1299         adde    r10,r10,r9
1300         addze   r11,r0
1301                                         #mul_add_c(a[6],b[2],c3,c1,c2);
1302         $LD     r6,`6*$BNSZ`(r4)
1303         $LD     r7,`2*$BNSZ`(r5)
1304         $UMULL  r8,r6,r7
1305         $UMULH  r9,r6,r7
1306         addc    r12,r12,r8
1307         adde    r10,r10,r9
1308         addze   r11,r11
1309                                         #mul_add_c(a[5],b[3],c3,c1,c2);
1310         $LD     r6,`5*$BNSZ`(r4)
1311         $LD     r7,`3*$BNSZ`(r5)
1312         $UMULL  r8,r6,r7
1313         $UMULH  r9,r6,r7
1314         addc    r12,r12,r8
1315         adde    r10,r10,r9
1316         addze   r11,r11
1317                                         #mul_add_c(a[4],b[4],c3,c1,c2);
1318         $LD     r6,`4*$BNSZ`(r4)
1319         $LD     r7,`4*$BNSZ`(r5)
1320         $UMULL  r8,r6,r7
1321         $UMULH  r9,r6,r7
1322         addc    r12,r12,r8
1323         adde    r10,r10,r9
1324         addze   r11,r11
1325                                         #mul_add_c(a[3],b[5],c3,c1,c2);
1326         $LD     r6,`3*$BNSZ`(r4)
1327         $LD     r7,`5*$BNSZ`(r5)
1328         $UMULL  r8,r6,r7
1329         $UMULH  r9,r6,r7
1330         addc    r12,r12,r8
1331         adde    r10,r10,r9
1332         addze   r11,r11
1333                                         #mul_add_c(a[2],b[6],c3,c1,c2);
1334         $LD     r6,`2*$BNSZ`(r4)
1335         $LD     r7,`6*$BNSZ`(r5)
1336         $UMULL  r8,r6,r7
1337         $UMULH  r9,r6,r7
1338         addc    r12,r12,r8
1339         adde    r10,r10,r9
1340         addze   r11,r11
1341                                         #mul_add_c(a[1],b[7],c3,c1,c2);
1342         $LD     r6,`1*$BNSZ`(r4)
1343         $LD     r7,`7*$BNSZ`(r5)
1344         $UMULL  r8,r6,r7
1345         $UMULH  r9,r6,r7
1346         addc    r12,r12,r8
1347         adde    r10,r10,r9
1348         addze   r11,r11
1349         $ST     r12,`8*$BNSZ`(r3)       #r[8]=c3;
1350                                         #mul_add_c(a[2],b[7],c1,c2,c3);
1351         $LD     r6,`2*$BNSZ`(r4)
1352         $UMULL  r8,r6,r7
1353         $UMULH  r9,r6,r7
1354         addc    r10,r10,r8
1355         adde    r11,r11,r9
1356         addze   r12,r0
1357                                         #mul_add_c(a[3],b[6],c1,c2,c3);
1358         $LD     r6,`3*$BNSZ`(r4)
1359         $LD     r7,`6*$BNSZ`(r5)
1360         $UMULL  r8,r6,r7
1361         $UMULH  r9,r6,r7
1362         addc    r10,r10,r8
1363         adde    r11,r11,r9
1364         addze   r12,r12
1365                                         #mul_add_c(a[4],b[5],c1,c2,c3);
1366         $LD     r6,`4*$BNSZ`(r4)
1367         $LD     r7,`5*$BNSZ`(r5)
1368         $UMULL  r8,r6,r7
1369         $UMULH  r9,r6,r7
1370         addc    r10,r10,r8
1371         adde    r11,r11,r9
1372         addze   r12,r12
1373                                         #mul_add_c(a[5],b[4],c1,c2,c3);
1374         $LD     r6,`5*$BNSZ`(r4)
1375         $LD     r7,`4*$BNSZ`(r5)
1376         $UMULL  r8,r6,r7
1377         $UMULH  r9,r6,r7
1378         addc    r10,r10,r8
1379         adde    r11,r11,r9
1380         addze   r12,r12
1381                                         #mul_add_c(a[6],b[3],c1,c2,c3);
1382         $LD     r6,`6*$BNSZ`(r4)
1383         $LD     r7,`3*$BNSZ`(r5)
1384         $UMULL  r8,r6,r7
1385         $UMULH  r9,r6,r7
1386         addc    r10,r10,r8
1387         adde    r11,r11,r9
1388         addze   r12,r12
1389                                         #mul_add_c(a[7],b[2],c1,c2,c3);
1390         $LD     r6,`7*$BNSZ`(r4)
1391         $LD     r7,`2*$BNSZ`(r5)
1392         $UMULL  r8,r6,r7
1393         $UMULH  r9,r6,r7
1394         addc    r10,r10,r8
1395         adde    r11,r11,r9
1396         addze   r12,r12
1397         $ST     r10,`9*$BNSZ`(r3)       #r[9]=c1;
1398                                         #mul_add_c(a[7],b[3],c2,c3,c1);
1399         $LD     r7,`3*$BNSZ`(r5)
1400         $UMULL  r8,r6,r7
1401         $UMULH  r9,r6,r7
1402         addc    r11,r11,r8
1403         adde    r12,r12,r9
1404         addze   r10,r0
1405                                         #mul_add_c(a[6],b[4],c2,c3,c1);
1406         $LD     r6,`6*$BNSZ`(r4)
1407         $LD     r7,`4*$BNSZ`(r5)
1408         $UMULL  r8,r6,r7
1409         $UMULH  r9,r6,r7
1410         addc    r11,r11,r8
1411         adde    r12,r12,r9
1412         addze   r10,r10
1413                                         #mul_add_c(a[5],b[5],c2,c3,c1);
1414         $LD     r6,`5*$BNSZ`(r4)
1415         $LD     r7,`5*$BNSZ`(r5)
1416         $UMULL  r8,r6,r7
1417         $UMULH  r9,r6,r7
1418         addc    r11,r11,r8
1419         adde    r12,r12,r9
1420         addze   r10,r10
1421                                         #mul_add_c(a[4],b[6],c2,c3,c1);
1422         $LD     r6,`4*$BNSZ`(r4)
1423         $LD     r7,`6*$BNSZ`(r5)
1424         $UMULL  r8,r6,r7
1425         $UMULH  r9,r6,r7
1426         addc    r11,r11,r8
1427         adde    r12,r12,r9
1428         addze   r10,r10
1429                                         #mul_add_c(a[3],b[7],c2,c3,c1);
1430         $LD     r6,`3*$BNSZ`(r4)
1431         $LD     r7,`7*$BNSZ`(r5)
1432         $UMULL  r8,r6,r7
1433         $UMULH  r9,r6,r7
1434         addc    r11,r11,r8
1435         adde    r12,r12,r9
1436         addze   r10,r10
1437         $ST     r11,`10*$BNSZ`(r3)      #r[10]=c2;
1438                                         #mul_add_c(a[4],b[7],c3,c1,c2);
1439         $LD     r6,`4*$BNSZ`(r4)
1440         $UMULL  r8,r6,r7
1441         $UMULH  r9,r6,r7
1442         addc    r12,r12,r8
1443         adde    r10,r10,r9
1444         addze   r11,r0
1445                                         #mul_add_c(a[5],b[6],c3,c1,c2);
1446         $LD     r6,`5*$BNSZ`(r4)
1447         $LD     r7,`6*$BNSZ`(r5)
1448         $UMULL  r8,r6,r7
1449         $UMULH  r9,r6,r7
1450         addc    r12,r12,r8
1451         adde    r10,r10,r9
1452         addze   r11,r11
1453                                         #mul_add_c(a[6],b[5],c3,c1,c2);
1454         $LD     r6,`6*$BNSZ`(r4)
1455         $LD     r7,`5*$BNSZ`(r5)
1456         $UMULL  r8,r6,r7
1457         $UMULH  r9,r6,r7
1458         addc    r12,r12,r8
1459         adde    r10,r10,r9
1460         addze   r11,r11
1461                                         #mul_add_c(a[7],b[4],c3,c1,c2);
1462         $LD     r6,`7*$BNSZ`(r4)
1463         $LD     r7,`4*$BNSZ`(r5)
1464         $UMULL  r8,r6,r7
1465         $UMULH  r9,r6,r7
1466         addc    r12,r12,r8
1467         adde    r10,r10,r9
1468         addze   r11,r11
1469         $ST     r12,`11*$BNSZ`(r3)      #r[11]=c3;
1470                                         #mul_add_c(a[7],b[5],c1,c2,c3);
1471         $LD     r7,`5*$BNSZ`(r5)
1472         $UMULL  r8,r6,r7
1473         $UMULH  r9,r6,r7
1474         addc    r10,r10,r8
1475         adde    r11,r11,r9
1476         addze   r12,r0
1477                                         #mul_add_c(a[6],b[6],c1,c2,c3);
1478         $LD     r6,`6*$BNSZ`(r4)
1479         $LD     r7,`6*$BNSZ`(r5)
1480         $UMULL  r8,r6,r7
1481         $UMULH  r9,r6,r7
1482         addc    r10,r10,r8
1483         adde    r11,r11,r9
1484         addze   r12,r12
1485                                         #mul_add_c(a[5],b[7],c1,c2,c3);
1486         $LD     r6,`5*$BNSZ`(r4)
1487         $LD     r7,`7*$BNSZ`(r5)
1488         $UMULL  r8,r6,r7
1489         $UMULH  r9,r6,r7
1490         addc    r10,r10,r8
1491         adde    r11,r11,r9
1492         addze   r12,r12
1493         $ST     r10,`12*$BNSZ`(r3)      #r[12]=c1;
1494                                         #mul_add_c(a[6],b[7],c2,c3,c1);
1495         $LD     r6,`6*$BNSZ`(r4)
1496         $UMULL  r8,r6,r7
1497         $UMULH  r9,r6,r7
1498         addc    r11,r11,r8
1499         adde    r12,r12,r9
1500         addze   r10,r0
1501                                         #mul_add_c(a[7],b[6],c2,c3,c1);
1502         $LD     r6,`7*$BNSZ`(r4)
1503         $LD     r7,`6*$BNSZ`(r5)
1504         $UMULL  r8,r6,r7
1505         $UMULH  r9,r6,r7
1506         addc    r11,r11,r8
1507         adde    r12,r12,r9
1508         addze   r10,r10
1509         $ST     r11,`13*$BNSZ`(r3)      #r[13]=c2;
1510                                         #mul_add_c(a[7],b[7],c3,c1,c2);
1511         $LD     r7,`7*$BNSZ`(r5)
1512         $UMULL  r8,r6,r7
1513         $UMULH  r9,r6,r7
1514         addc    r12,r12,r8
1515         adde    r10,r10,r9
1516         $ST     r12,`14*$BNSZ`(r3)      #r[14]=c3;
1517         $ST     r10,`15*$BNSZ`(r3)      #r[15]=c1;
1518         blr
1519         .long   0
1520         .byte   0,12,0x14,0,0,0,3,0
1521         .long   0
1522 .size   .bn_mul_comba8,.-.bn_mul_comba8
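#
#	For reference: each "#mul_add_c(a,b,c0,c1,c2)" annotation above
#	denotes one word multiply-accumulate into a three-word carry
#	chain. A C sketch of the semantics (BN_ULLONG standing in for a
#	hypothetical type twice as wide as BN_ULONG; each "+ carry"
#	takes the carry out of the previous addition):
#
#		t   = (BN_ULLONG)a * b;
#		c0 += (BN_ULONG)t;				/* addc  */
#		c1 += (BN_ULONG)(t >> BN_BITS2) + carry;	/* adde  */
#		c2 += carry;					/* addze */
#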
1523
1524 #
1525 #       NOTE:   The following label name should be changed to
1526 #               "bn_sub_words" i.e. remove the first dot
1527 #               for the gcc compiler. This should be automatically
1528 #               done in the build
1529 #
1530 #
1531 .align  4
1532 .bn_sub_words:
1533 #
1534 #       Handcoded version of bn_sub_words
1535 #
1536 #BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1537 #
1538 #       r3 = r
1539 #       r4 = a
1540 #       r5 = b
1541 #       r6 = n
1542 #
1543 #       Note:   No loop unrolling done since this is not a performance
1544 #               critical loop.
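#
#	For reference, a C sketch of the semantics (illustrative only,
#	not the actual OpenSSL C source):
#
#		BN_ULONG borrow = 0;
#		for (int i = 0; i < n; i++) {
#			BN_ULONG t = a[i] - b[i] - borrow;
#			borrow = (a[i] < b[i]) || (borrow && a[i] == b[i]);
#			r[i] = t;
#		}
#		return borrow;		/* 1 if a < b, else 0 */
#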
1545
1546         xor     r0,r0,r0        #set r0 = 0
1547 #
1548 #       check for r6 = 0 AND set carry bit.
1549 #
1550         subfc.  r7,r0,r6        # If r6 is 0 then result is 0.
1551                                 # if r6 > 0 then result !=0
1552                                 # In either case carry bit is set.
1553         beq     Lppcasm_sub_adios
1554         addi    r4,r4,-$BNSZ
1555         addi    r3,r3,-$BNSZ
1556         addi    r5,r5,-$BNSZ
1557         mtctr   r6
1558 Lppcasm_sub_mainloop:   
1559         $LDU    r7,$BNSZ(r4)
1560         $LDU    r8,$BNSZ(r5)
1561         subfe   r6,r8,r7        # r6 = r7+carry bit + onescomplement(r8)
1562                                 # if carry = 1 this is r7-r8. Else it
1563                                 # is r7-r8 -1 as we need.
1564         $STU    r6,$BNSZ(r3)
1565         bdnz    Lppcasm_sub_mainloop
1566 Lppcasm_sub_adios:      
1567         subfze  r3,r0           # if carry bit is set then r3 = 0 else -1
1568         andi.   r3,r3,1         # keep only last bit.
1569         blr
1570         .long   0
1571         .byte   0,12,0x14,0,0,0,4,0
1572         .long   0
1573 .size   .bn_sub_words,.-.bn_sub_words
1574
1575 #
1576 #       NOTE:   The following label name should be changed to
1577 #               "bn_add_words" i.e. remove the first dot
1578 #               for the gcc compiler. This should be automatically
1579 #               done in the build
1580 #
1581
1582 .align  4
1583 .bn_add_words:
1584 #
1585 #       Handcoded version of bn_add_words
1586 #
1587 #BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1588 #
1589 #       r3 = r
1590 #       r4 = a
1591 #       r5 = b
1592 #       r6 = n
1593 #
1594 #       Note:   No loop unrolling done since this is not a performance
1595 #               critical loop.
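#
#	For reference, a C sketch of the semantics (illustrative only,
#	not the actual OpenSSL C source):
#
#		BN_ULONG carry = 0;
#		for (int i = 0; i < n; i++) {
#			BN_ULONG t = a[i] + b[i] + carry;
#			carry = (t < a[i]) || (carry && t == a[i]);
#			r[i] = t;
#		}
#		return carry;
#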
1596
1597         xor     r0,r0,r0
1598 #
1599 #       check for r6 = 0 (needed; the same instruction also clears the carry bit).
1600 #
1601         addic.  r6,r6,0         #test r6 and clear carry bit.
1602         beq     Lppcasm_add_adios
1603         addi    r4,r4,-$BNSZ
1604         addi    r3,r3,-$BNSZ
1605         addi    r5,r5,-$BNSZ
1606         mtctr   r6
1607 Lppcasm_add_mainloop:   
1608         $LDU    r7,$BNSZ(r4)
1609         $LDU    r8,$BNSZ(r5)
1610         adde    r8,r7,r8
1611         $STU    r8,$BNSZ(r3)
1612         bdnz    Lppcasm_add_mainloop
1613 Lppcasm_add_adios:      
1614         addze   r3,r0                   #return carry bit.
1615         blr
1616         .long   0
1617         .byte   0,12,0x14,0,0,0,4,0
1618         .long   0
1619 .size   .bn_add_words,.-.bn_add_words
1620
1621 #
1622 #       NOTE:   The following label name should be changed to
1623 #               "bn_div_words" i.e. remove the first dot
1624 #               for the gcc compiler. This should be automatically
1625 #               done in the build
1626 #
1627
1628 .align  4
1629 .bn_div_words:
1630 #
1631 #       This is a cleaned up version of code generated by
1632 #       the AIX compiler. The only optimization is to use
1633 #       the PPC instruction to count leading zeros instead
1634 #       of a call to num_bits_word. Since this was compiled
1635 #       at only -O2, it could probably be squeezed further.
1636 #       
1637 #       r3 = h
1638 #       r4 = l
1639 #       r5 = d
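#
#	For reference: this computes the one-word quotient
#	((h << BN_BITS2) | l) / d; normal usage has h < d so the
#	quotient fits in a single word, and d == 0 returns -1 (below).
#	With a hypothetical double-width type BN_ULLONG the routine
#	would collapse to:
#
#		return (BN_ULONG)((((BN_ULLONG)h << BN_BITS2) | l) / d);
#
#	No such type is available here, so the quotient is formed one
#	BN_BITS4-wide half at a time.
#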
1640         
1641         $UCMPI  0,r5,0                  # compare r5 and 0
1642         bne     Lppcasm_div1            # proceed if d!=0
1643         li      r3,-1                   # d=0 return -1
1644         blr
1645 Lppcasm_div1:
1646         xor     r0,r0,r0                #r0=0
1647         li      r8,$BITS
1648         $CNTLZ. r7,r5                   #r7 = num leading 0s in d.
1649         beq     Lppcasm_div2            #proceed if no leading zeros
1650         subf    r8,r7,r8                #r8 = BN_num_bits_word(d)
1651         $SHR.   r9,r3,r8                #are there any bits above the r8-th?
1652         $TR     16,r9,r0                #if there are, trap to dump core...
1653 Lppcasm_div2:
1654         $UCMP   0,r3,r5                 #h>=d?
1655         blt     Lppcasm_div3            #goto Lppcasm_div3 if not
1656         subf    r3,r5,r3                #h-=d ; 
1657 Lppcasm_div3:                           #r7 = BN_BITS2 - num_bits(d), i.e. r7 = i
1658         cmpi    0,0,r7,0                # is (i == 0)?
1659         beq     Lppcasm_div4
1660         $SHL    r3,r3,r7                # h = (h<< i)
1661         $SHR    r8,r4,r8                # r8 = l >> (BN_BITS2-i)
1662         $SHL    r5,r5,r7                # d<<=i
1663         or      r3,r3,r8                # h = (h<<i)|(l>>(BN_BITS2-i))
1664         $SHL    r4,r4,r7                # l <<=i
1665 Lppcasm_div4:
1666         $SHRI   r9,r5,`$BITS/2`         # r9 = dh
1667                                         # dl will be computed when needed
1668                                         # as it saves registers.
1669         li      r6,2                    #r6=2
1670         mtctr   r6                      #counter will be in the count register.
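#
#	The outer loop runs exactly twice: each pass produces one
#	BN_BITS4-wide half of the quotient, high half first. Each pass
#	estimates q = h/dh and then lets the inner loop below walk q
#	down while q*d would overshoot the remainder so far (the
#	classical schoolbook-division correction step).
#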
1671 Lppcasm_divouterloop: 
1672         $SHRI   r8,r3,`$BITS/2`         #r8 = (h>>BN_BITS4)
1673         $SHRI   r11,r4,`$BITS/2`        #r11= (l&BN_MASK2h)>>BN_BITS4
1674                                         # compute here for innerloop.
1675         $UCMP   0,r8,r9                 # is (h>>BN_BITS4)==dh
1676         bne     Lppcasm_div5            # goto Lppcasm_div5 if not
1677
1678         li      r8,-1
1679         $CLRU   r8,r8,`$BITS/2`         #q = BN_MASK2l 
1680         b       Lppcasm_div6
1681 Lppcasm_div5:
1682         $UDIV   r8,r3,r9                #q = h/dh
1683 Lppcasm_div6:
1684         $UMULL  r12,r9,r8               #th = q*dh
1685         $CLRU   r10,r5,`$BITS/2`        #r10=dl
1686         $UMULL  r6,r8,r10               #tl = q*dl
1687         
1688 Lppcasm_divinnerloop:
1689         subf    r10,r12,r3              #t = h -th
1690         $SHRI   r7,r10,`$BITS/2`        #r7 = t>>BN_BITS4, i.e. the high half of t
1691         addic.  r7,r7,0                 #test if r7 == 0. used below.
1692                                         # now want to compute
1693                                         # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
1694                                         # the following 2 instructions do that
1695         $SHLI   r7,r10,`$BITS/2`        # r7 = (t<<BN_BITS4)
1696         or      r7,r7,r11               # r7|=((l&BN_MASK2h)>>BN_BITS4)
1697         $UCMP   cr1,r6,r7               # is tl <= r7 ? (result in cr1)
1698         bne     Lppcasm_divinnerexit
1699         ble     cr1,Lppcasm_divinnerexit
1700         addi    r8,r8,-1                #q--
1701         subf    r12,r9,r12              #th -=dh
1702         $CLRU   r10,r5,`$BITS/2`        #r10=dl. t is no longer needed in loop.
1703         subf    r6,r10,r6               #tl -=dl
1704         b       Lppcasm_divinnerloop
1705 Lppcasm_divinnerexit:
1706         $SHRI   r10,r6,`$BITS/2`        #t=(tl>>BN_BITS4)
1707         $SHLI   r11,r6,`$BITS/2`        #tl=(tl<<BN_BITS4)&BN_MASK2h;
1708         $UCMP   cr1,r4,r11              # compare l and tl
1709         add     r12,r12,r10             # th+=t
1710         bge     cr1,Lppcasm_div7        # if (l>=tl) goto Lppcasm_div7
1711         addi    r12,r12,1               # th++
1712 Lppcasm_div7:
1713         subf    r11,r11,r4              #r11=l-tl
1714         $UCMP   cr1,r3,r12              #compare h and th
1715         bge     cr1,Lppcasm_div8        #if (h>=th) goto Lppcasm_div8
1716         addi    r8,r8,-1                # q--
1717         add     r3,r5,r3                # h+=d
1718 Lppcasm_div8:
1719         subf    r12,r12,r3              #r12 = h-th
1720         $SHLI   r4,r11,`$BITS/2`        #l=(l&BN_MASK2l)<<BN_BITS4
1721                                         # want to compute
1722                                         # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
1723                                         # the following 2 instructions will do this.
1724         $INSR   r11,r12,`$BITS/2`,`$BITS/2`     # r11 is the value we want rotated $BITS/2.
1725         $ROTL   r3,r11,`$BITS/2`        # rotate by $BITS/2 and store in r3
1726         bdz     Lppcasm_div9            #if (count==0) break ;
1727         $SHLI   r0,r8,`$BITS/2`         #ret =q<<BN_BITS4
1728         b       Lppcasm_divouterloop
1729 Lppcasm_div9:
1730         or      r3,r8,r0
1731         blr
1732         .long   0
1733         .byte   0,12,0x14,0,0,0,3,0
1734         .long   0
1735 .size   .bn_div_words,.-.bn_div_words
1736
1737 #
1738 #       NOTE:   The following label name should be changed to
1739 #               "bn_sqr_words" i.e. remove the first dot
1740 #               for the gcc compiler. This should be automatically
1741 #               done in the build
1742 #
1743 .align  4
1744 .bn_sqr_words:
1745 #
1746 #       Optimized version of bn_sqr_words
1747 #
1748 #       void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
1749 #
1750 #       r3 = r
1751 #       r4 = a
1752 #       r5 = n
1753 #
1754 #       r6 = a[i].
1755 #       r7,r8 = product.
1756 #
1757 #       No unrolling done here. Not performance critical.
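#
#	For reference, a C sketch of the semantics, where lo()/hi()
#	informally denote the low and high words of the double-width
#	product (notation only, not real macros):
#
#		for (int i = 0; i < n; i++) {
#			r[2*i]   = lo(a[i] * a[i]);
#			r[2*i+1] = hi(a[i] * a[i]);
#		}
#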
1758
1759         addic.  r5,r5,0                 #test r5.
1760         beq     Lppcasm_sqr_adios
1761         addi    r4,r4,-$BNSZ
1762         addi    r3,r3,-$BNSZ
1763         mtctr   r5
1764 Lppcasm_sqr_mainloop:   
1765                                         #sqr(r[0],r[1],a[0]);
1766         $LDU    r6,$BNSZ(r4)
1767         $UMULL  r7,r6,r6
1768         $UMULH  r8,r6,r6
1769         $STU    r7,$BNSZ(r3)
1770         $STU    r8,$BNSZ(r3)
1771         bdnz    Lppcasm_sqr_mainloop
1772 Lppcasm_sqr_adios:      
1773         blr
1774         .long   0
1775         .byte   0,12,0x14,0,0,0,3,0
1776         .long   0
1777 .size   .bn_sqr_words,.-.bn_sqr_words
1778
1779 #
1780 #       NOTE:   The following label name should be changed to
1781 #               "bn_mul_words" i.e. remove the first dot
1782 #               for the gcc compiler. This should be automatically
1783 #               done in the build
1784 #
1785
1786 .align  4       
1787 .bn_mul_words:
1788 #
1789 # BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1790 #
1791 # r3 = rp
1792 # r4 = ap
1793 # r5 = num
1794 # r6 = w
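#
# For reference, a C sketch of the semantics, where lo()/hi()
# informally denote the low and high words of the double-width
# product (notation only, not real macros):
#
#	BN_ULONG c = 0;
#	for (int i = 0; i < num; i++) {
#		rp[i] = lo(ap[i] * w) + c;
#		c     = hi(ap[i] * w) + (rp[i] < c);	/* propagate carry */
#	}
#	return c;
#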
1795         xor     r0,r0,r0
1796         xor     r12,r12,r12             # used for carry
1797         rlwinm. r7,r5,30,2,31           # num >> 2
1798         beq     Lppcasm_mw_REM
1799         mtctr   r7
1800 Lppcasm_mw_LOOP:        
1801                                         #mul(rp[0],ap[0],w,c1);
1802         $LD     r8,`0*$BNSZ`(r4)
1803         $UMULL  r9,r6,r8
1804         $UMULH  r10,r6,r8
1805         addc    r9,r9,r12
1806         #addze  r10,r10                 #carry is NOT ignored.
1807                                         #will be taken care of
1808                                         #in second spin below
1809                                         #using adde.
1810         $ST     r9,`0*$BNSZ`(r3)
1811                                         #mul(rp[1],ap[1],w,c1);
1812         $LD     r8,`1*$BNSZ`(r4)        
1813         $UMULL  r11,r6,r8
1814         $UMULH  r12,r6,r8
1815         adde    r11,r11,r10
1816         #addze  r12,r12
1817         $ST     r11,`1*$BNSZ`(r3)
1818                                         #mul(rp[2],ap[2],w,c1);
1819         $LD     r8,`2*$BNSZ`(r4)
1820         $UMULL  r9,r6,r8
1821         $UMULH  r10,r6,r8
1822         adde    r9,r9,r12
1823         #addze  r10,r10
1824         $ST     r9,`2*$BNSZ`(r3)
1825                                         #mul(rp[3],ap[3],w,c1);
1826         $LD     r8,`3*$BNSZ`(r4)
1827         $UMULL  r11,r6,r8
1828         $UMULH  r12,r6,r8
1829         adde    r11,r11,r10
1830         addze   r12,r12                 #this spin we collect carry into
1831                                         #r12
1832         $ST     r11,`3*$BNSZ`(r3)
1833         
1834         addi    r3,r3,`4*$BNSZ`
1835         addi    r4,r4,`4*$BNSZ`
1836         bdnz    Lppcasm_mw_LOOP
1837
1838 Lppcasm_mw_REM:
1839         andi.   r5,r5,0x3
1840         beq     Lppcasm_mw_OVER
1841                                         #mul(rp[0],ap[0],w,c1);
1842         $LD     r8,`0*$BNSZ`(r4)
1843         $UMULL  r9,r6,r8
1844         $UMULH  r10,r6,r8
1845         addc    r9,r9,r12
1846         addze   r10,r10
1847         $ST     r9,`0*$BNSZ`(r3)
1848         addi    r12,r10,0
1849         
1850         addi    r5,r5,-1
1851         cmpli   0,0,r5,0
1852         beq     Lppcasm_mw_OVER
1853
1854         
1855                                         #mul(rp[1],ap[1],w,c1);
1856         $LD     r8,`1*$BNSZ`(r4)        
1857         $UMULL  r9,r6,r8
1858         $UMULH  r10,r6,r8
1859         addc    r9,r9,r12
1860         addze   r10,r10
1861         $ST     r9,`1*$BNSZ`(r3)
1862         addi    r12,r10,0
1863         
1864         addi    r5,r5,-1
1865         cmpli   0,0,r5,0
1866         beq     Lppcasm_mw_OVER
1867         
1868                                         #mul(rp[2],ap[2],w,c1);
1869         $LD     r8,`2*$BNSZ`(r4)
1870         $UMULL  r9,r6,r8
1871         $UMULH  r10,r6,r8
1872         addc    r9,r9,r12
1873         addze   r10,r10
1874         $ST     r9,`2*$BNSZ`(r3)
1875         addi    r12,r10,0
1876                 
1877 Lppcasm_mw_OVER:        
1878         addi    r3,r12,0
1879         blr
1880         .long   0
1881         .byte   0,12,0x14,0,0,0,4,0
1882         .long   0
1883 .size   .bn_mul_words,.-.bn_mul_words
1884
1885 #
1886 #       NOTE:   The following label name should be changed to
1887 #               "bn_mul_add_words" i.e. remove the first dot
1888 #               for the gcc compiler. This should be automatically
1889 #               done in the build
1890 #
1891
1892 .align  4
1893 .bn_mul_add_words:
1894 #
1895 # BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1896 #
1897 # r3 = rp
1898 # r4 = ap
1899 # r5 = num
1900 # r6 = w
1901 #
1902 # empirical evidence suggests that the unrolled version performs best!
1903 #
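# For reference, a C sketch of the semantics, with lo()/hi() as
# informal notation for the halves of the double-width product:
#
#	BN_ULONG c = 0;
#	for (int i = 0; i < num; i++) {
#		BN_ULONG h = hi(ap[i] * w);
#		BN_ULONG t = lo(ap[i] * w) + c;
#		h += (t < c);			/* carry from adding c     */
#		t += rp[i];
#		h += (t < rp[i]);		/* carry from adding rp[i] */
#		rp[i] = t;
#		c = h;
#	}
#	return c;
#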
1904         xor     r0,r0,r0                #r0 = 0
1905         xor     r12,r12,r12             #r12 = 0 . used for carry               
1906         rlwinm. r7,r5,30,2,31           # num >> 2
1907         beq     Lppcasm_maw_leftover    # if (num < 4) goto Lppcasm_maw_leftover
1908         mtctr   r7
1909 Lppcasm_maw_mainloop:   
1910                                         #mul_add(rp[0],ap[0],w,c1);
1911         $LD     r8,`0*$BNSZ`(r4)
1912         $LD     r11,`0*$BNSZ`(r3)
1913         $UMULL  r9,r6,r8
1914         $UMULH  r10,r6,r8
1915         addc    r9,r9,r12               #r12 is carry.
1916         addze   r10,r10
1917         addc    r9,r9,r11
1918         #addze  r10,r10
1919                                         #the above instruction addze
1920                                         #is NOT needed. Carry will NOT
1921                                         #be ignored. It's not affected
1922                                         #by multiply and will be collected
1923                                         #in the next spin
1924         $ST     r9,`0*$BNSZ`(r3)
1925         
1926                                         #mul_add(rp[1],ap[1],w,c1);
1927         $LD     r8,`1*$BNSZ`(r4)        
1928         $LD     r9,`1*$BNSZ`(r3)
1929         $UMULL  r11,r6,r8
1930         $UMULH  r12,r6,r8
1931         adde    r11,r11,r10             #r10 is carry.
1932         addze   r12,r12
1933         addc    r11,r11,r9
1934         #addze  r12,r12
1935         $ST     r11,`1*$BNSZ`(r3)
1936         
1937                                         #mul_add(rp[2],ap[2],w,c1);
1938         $LD     r8,`2*$BNSZ`(r4)
1939         $UMULL  r9,r6,r8
1940         $LD     r11,`2*$BNSZ`(r3)
1941         $UMULH  r10,r6,r8
1942         adde    r9,r9,r12
1943         addze   r10,r10
1944         addc    r9,r9,r11
1945         #addze  r10,r10
1946         $ST     r9,`2*$BNSZ`(r3)
1947         
1948                                         #mul_add(rp[3],ap[3],w,c1);
1949         $LD     r8,`3*$BNSZ`(r4)
1950         $UMULL  r11,r6,r8
1951         $LD     r9,`3*$BNSZ`(r3)
1952         $UMULH  r12,r6,r8
1953         adde    r11,r11,r10
1954         addze   r12,r12
1955         addc    r11,r11,r9
1956         addze   r12,r12
1957         $ST     r11,`3*$BNSZ`(r3)
1958         addi    r3,r3,`4*$BNSZ`
1959         addi    r4,r4,`4*$BNSZ`
1960         bdnz    Lppcasm_maw_mainloop
1961         
1962 Lppcasm_maw_leftover:
1963         andi.   r5,r5,0x3
1964         beq     Lppcasm_maw_adios
1965         addi    r3,r3,-$BNSZ
1966         addi    r4,r4,-$BNSZ
1967                                         #mul_add(rp[0],ap[0],w,c1);
1968         mtctr   r5
1969         $LDU    r8,$BNSZ(r4)
1970         $UMULL  r9,r6,r8
1971         $UMULH  r10,r6,r8
1972         $LDU    r11,$BNSZ(r3)
1973         addc    r9,r9,r11
1974         addze   r10,r10
1975         addc    r9,r9,r12
1976         addze   r12,r10
1977         $ST     r9,0(r3)
1978         
1979         bdz     Lppcasm_maw_adios
1980                                         #mul_add(rp[1],ap[1],w,c1);
1981         $LDU    r8,$BNSZ(r4)    
1982         $UMULL  r9,r6,r8
1983         $UMULH  r10,r6,r8
1984         $LDU    r11,$BNSZ(r3)
1985         addc    r9,r9,r11
1986         addze   r10,r10
1987         addc    r9,r9,r12
1988         addze   r12,r10
1989         $ST     r9,0(r3)
1990         
1991         bdz     Lppcasm_maw_adios
1992                                         #mul_add(rp[2],ap[2],w,c1);
1993         $LDU    r8,$BNSZ(r4)
1994         $UMULL  r9,r6,r8
1995         $UMULH  r10,r6,r8
1996         $LDU    r11,$BNSZ(r3)
1997         addc    r9,r9,r11
1998         addze   r10,r10
1999         addc    r9,r9,r12
2000         addze   r12,r10
2001         $ST     r9,0(r3)
2002                 
2003 Lppcasm_maw_adios:      
2004         addi    r3,r12,0
2005         blr
2006         .long   0
2007         .byte   0,12,0x14,0,0,0,4,0
2008         .long   0
2009 .size   .bn_mul_add_words,.-.bn_mul_add_words
2010         .align  4
2011 EOF
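# Evaluate every `...`-quoted Perl expression embedded in the template
# above (e.g. `4*$BNSZ`) and splice the result into the emitted code.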
2012 $data =~ s/\`([^\`]*)\`/eval $1/gem;
2013 print $data;
2014 close STDOUT or die "error closing STDOUT: $!";