PA-RISC assembler: missing symbol and typos.
[openssl.git] / crypto / bn / asm / parisc-mont.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # On PA-7100LC this module performs ~90-50% better, less for longer
11 # keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
12 # that compiler utilized xmpyu instruction to perform 32x32=64-bit
13 # multiplication, which in turn means that "baseline" performance was
14 # optimal in respect to instruction set capabilities. Fair comparison
15 # with vendor compiler is problematic, because OpenSSL doesn't define
16 # BN_LLONG [presumably] for historical reasons, which drives compiler
17 # toward 4 times 16x16=32-bit multiplications [plus complementary
18 # shifts and additions] instead. This means that you should observe
19 # several times improvement over code generated by vendor compiler
20 # for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
21 # improvement coefficient was never collected on PA-7100LC, or any
22 # other 1.1 CPU, because I don't have access to such machine with
23 # vendor compiler. But to give you a taste, PA-RISC 1.1 code-path
24 # reportedly outperformed code generated by cc +DA1.1 +O3 by factor
25 # of ~5x on PA-8600.
26 #
27 # On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
28 # reportedly ~2x faster than vendor compiler generated code [see
29 # commentary in assembler source code]. Here comes a catch. Execution
30 # core of this implementation is actually 32-bit one, in the sense
31 # that it expects arrays of 32-bit BN_LONG values as input. But
32 # pa-risc2[W].s operates on arrays of 64-bit BN_LONGs... How do they
33 # interoperate then? Simple. This module picks halves of 64-bit
34 # values in reverse order. But can it compete with "pure" 64-bit code
35 # such as pa-risc2[W].s then? Well, the thing is that 32x32=64-bit
36 # multiplication is the best even PA-RISC 2.0 can do, i.e. there is
37 # no "wider" multiplication like on most other 64-bit platforms.
38 # This means that even being effectively 32-bit, this implementation
39 # performs the same computational task in same amount of arithmetic
40 # operations, most notably multiplications. It requires more memory
41 # references, most notably to tp[num], but this doesn't seem to
42 # exhaust memory port capacity. In other words this implementation,
43 # or more specifically its PA-RISC 2.0 code-path, competes with
44 # pa-risc2W.s on virtually same terms.
45 #
46 # In case it wasn't clear. The module has two distinct code-paths:
47 # PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
48 # additions and 64-bit integer loads, not to mention specific
49 # instruction scheduling. In 32-bit build module imposes couple of
50 # limitations: vector lengths has to be even and vector addresses has
51 # to be 64-bit aligned. Normally neither is a problem: most common
52 # key lengths are even and vectors are commonly malloc-ed, which
53 # ensures 64-bit alignment.
54 #
55 # Special thanks to polarhome.com for providing HP-UX account.
56 \f
# Determine the directory this script resides in; used below to probe
# opensslconf.h relative to the script's location.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;

$flavour = shift;
$output  = shift;

# Fail loudly if the output file cannot be created, instead of silently
# emitting nothing through a closed STDOUT.
open STDOUT,">$output" or die "can't open $output: $!";

# Select ABI-dependent parameters: instruction level, pointer size,
# frame layout and push/pop mnemonics. $BN_SZ is sizeof(BN_ULONG).
if ($flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
	$BN_SZ		=$SIZE_T;
} else {
	$LEVEL		="1.1";	#$LEVEL.="\n\t.ALLOW\t2.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
	$BN_SZ		=$SIZE_T;
	# 32-bit build may still use 64-bit BN_ULONG (SIXTY_FOUR_BIT);
	# in that case the PA-RISC 2.0 (narrow) code-path is taken.
	if (open CONF,"<","${dir}../../opensslconf.h") {
	    while(<CONF>) {
		if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
		    $BN_SZ=8;
		    $LEVEL="2.0";
		    last;
		}
	    }
	    close CONF;
	}
}
95
$FRAME=8*$SIZE_T+$FRAME_MARKER;	# 8 saved regs + frame marker
				#                [+ argument transfer]
$LOCALS=$FRAME-$FRAME_MARKER;
$FRAME+=32;			# local variables

# Integer scratch registers.
$tp="%r31";	# pointer into tp[], the temporary result vector
$ti1="%r29";	# temporary tp[j] values
$ti0="%r28";

# bn_mul_mont arguments; rp..np arrive in the four argument registers.
$rp="%r26";
$ap="%r25";
$bp="%r24";
$np="%r23";
$n0="%r22";	# passed through stack in 32-bit
$num="%r21";	# passed through stack in 32-bit
$idx="%r20";	# loop index j (negative, counts up toward zero)
$arrsz="%r19";	# vector size in bytes

# 64-bit halves of ab=ap[j]*bp[i] and nm=np[j]*m column products.
$nm1="%r7";
$nm0="%r6";
$ab1="%r5";
$ab0="%r4";

$fp="%r3";	# frame pointer
$hi1="%r2";	# running carries
$hi0="%r1";

$xfer=$n0;	# accommodates [-16..15] offset in fld[dw]s

# FP registers: xmpyu operands/results are staged through these and
# spilled to the $xfer area for integer post-processing.
$fm0="%fr4";	$fti=$fm0;
$fbi="%fr5L";
$fn0="%fr5R";
$fai="%fr6";	$fab0="%fr7";	$fab1="%fr8";
$fni="%fr9";	$fnm0="%fr10";	$fnm1="%fr11";
130
# Assembler preamble plus bn_mul_mont prologue: save %r2..%r10 per the
# standard convention and establish the frame pointer $fp.
$code=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT	bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
	.ALIGN	16
bn_mul_mont
	.PROC
	.CALLINFO	FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)		; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	ldo	-$FRAME(%sp),$fp
___
# 32-bit ABI: the 5th and 6th arguments, n0 and num, arrive on the
# stack and are fetched relative to the frame pointer.
$code.=<<___ if ($SIZE_T==4);
	ldw	`-$FRAME_MARKER-4`($fp),$n0
	ldw	`-$FRAME_MARKER-8`($fp),$num
	nop
	nop					; alignment
___
# 32-bit BN_ULONG: return 0 ("unhandled") unless num>=6, num is even
# and ap/np are 64-bit aligned — the limitations noted in the header.
$code.=<<___ if ($BN_SZ==4);
	comiclr,<=	6,$num,%r0		; are vectors long enough?
	b		L\$abort
	ldi		0,%r28			; signal "unhandled"
	add,ev		%r0,$num,$num		; is $num even?
	b		L\$abort
	nop
	or		$ap,$np,$ti1
	extru,=		$ti1,31,3,%r0		; are ap and np 64-bit aligned?
	b		L\$abort
	nop
	nop					; alignment

	fldws		0($n0),${fn0}
	fldws,ma	4($bp),${fbi}		; bp[0]
___
# 64-bit BN_ULONG: the execution core is 32-bit, so num is doubled and
# halves of the 64-bit values are picked in flipped word order.
$code.=<<___ if ($BN_SZ==8);
	comib,>		3,$num,L\$abort		; are vectors long enough?
	ldi		0,%r28			; signal "unhandled"
	addl		$num,$num,$num		; I operate on 32-bit values

	fldws		4($n0),${fn0}		; only low part of n0
	fldws		4($bp),${fbi}		; bp[0] in flipped word order
___
# Carve a 32-byte-aligned tp[num+1] out of the stack, point $xfer/$tp
# into the local area, and start the first products: ap[0,1]*bp[0] and
# the Montgomery factor m = n0*low(ap[0]*bp[0]).
$code.=<<___;
	fldds	0($ap),${fai}		; ap[0,1]
	fldds	0($np),${fni}		; np[0,1]

	sh2addl	$num,%r0,$arrsz
	ldi	31,$hi0
	ldo	36($arrsz),$hi1		; space for tp[num+1]
	andcm	$hi1,$hi0,$hi1		; align
	addl	$hi1,%sp,%sp
	$PUSH	$fp,-$SIZE_T(%sp)

	ldo	`$LOCALS+16`($fp),$xfer
	ldo	`$LOCALS+32+4`($fp),$tp

	xmpyu	${fai}L,${fbi},${fab0}	; ap[0]*bp[0]
	xmpyu	${fai}R,${fbi},${fab1}	; ap[1]*bp[0]
	xmpyu	${fn0},${fab0}R,${fm0}

	addl	$arrsz,$ap,$ap		; point at the end
	addl	$arrsz,$np,$np
	subi	0,$arrsz,$idx		; j=0
	ldo	8($idx),$idx		; j++++

	xmpyu	${fni}L,${fm0}R,${fnm0}	; np[0]*m
	xmpyu	${fni}R,${fm0}R,${fnm1}	; np[1]*m
	fstds	${fab0},-16($xfer)
	fstds	${fnm0},-8($xfer)
	fstds	${fab1},0($xfer)
	fstds	${fnm1},8($xfer)
	 flddx	$idx($ap),${fai}	; ap[2,3]
	 flddx	$idx($np),${fni}	; np[2,3]
___
# 32-bit build only: runtime CPU probe — unless the 2.0 ISA is
# detected (see inline comments), branch to the PA-RISC 1.1 code-path
# emitted further down.
$code.=<<___ if ($BN_SZ==4);
	mtctl		$hi0,%cr11		; $hi0 still holds 31
	extrd,u,*=	$hi0,%sar,1,$hi0	; executes on PA-RISC 1.0
	b		L\$parisc11
	nop
___
# PA-RISC 2.0 code-path, first outer iteration (i=0): the L\$1st loop
# accumulates ap[j]*bp[0] + np[j]*m into tp[], two 32-bit words per
# turn, software-pipelined with the FP multiplies; lines indented by
# an extra space are the interleaved integer half of the pipeline.
$code.=<<___;				# PA-RISC 2.0 code-path
	ldd		-16($xfer),$ab0
	ldd		-8($xfer),$nm0

	extrd,u		$ab0,31,32,$hi0
	extrd,u		$ab0,63,32,$ab0
	 ldo		8($idx),$idx		; j++++
	 addl		$ab0,$nm0,$nm0		; low part is discarded
	 extrd,u	$nm0,31,32,$hi1
	ldd		0($xfer),$ab1
\f
L\$1st
	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	 ldd		8($xfer),$nm1
	fstds		${fab0},-16($xfer)
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	fstds		${fnm0},-8($xfer)
	 addl		$hi0,$ab1,$ab1
	fstds		${fab1},0($xfer)
	 extrd,u	$ab1,31,32,$hi0
	fstds		${fnm1},8($xfer)
	 extrd,u	$ab1,63,32,$ab1
	 addl		$hi1,$nm1,$nm1
	ldd		-16($xfer),$ab0
	 addl		$ab1,$nm1,$nm1
	ldd		-8($xfer),$nm0
	 extrd,u	$nm1,31,32,$hi1

	flddx		$idx($ap),${fai}	; ap[j,j+1]
	 addl		$hi0,$ab0,$ab0
	flddx		$idx($np),${fni}	; np[j,j+1]
	 extrd,u	$ab0,31,32,$hi0
	stw		$nm1,-4($tp)		; tp[j-1]
	 extrd,u	$ab0,63,32,$ab0
	 addl		$hi1,$nm0,$nm0
	 addl		$ab0,$nm0,$nm0
	ldd		0($xfer),$ab1
	 stw,ma		$nm0,8($tp)		; tp[j-1]
	addib,<>	8,$idx,L\$1st		; j++++
	 extrd,u	$nm0,31,32,$hi1

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	 ldd		8($xfer),$nm1
	fstds		${fab0},-16($xfer)
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	fstds		${fnm0},-8($xfer)
	 addl		$hi0,$ab1,$ab1
	fstds		${fab1},0($xfer)
	 extrd,u	$ab1,31,32,$hi0
	fstds		${fnm1},8($xfer)
	 extrd,u	$ab1,63,32,$ab1
	 addl		$hi1,$nm1,$nm1
	ldd		-16($xfer),$ab0
	 addl		$ab1,$nm1,$nm1
	ldd		-8($xfer),$nm0
	 extrd,u	$nm1,31,32,$hi1

	 addl		$hi0,$ab0,$ab0
	 extrd,u	$ab0,31,32,$hi0
	stw		$nm1,-4($tp)		; tp[j-1]
	 extrd,u	$ab0,63,32,$ab0
	 addl		$hi1,$nm0,$nm0
	ldd		0($xfer),$ab1
	 addl		$ab0,$nm0,$nm0
	ldd,mb		8($xfer),$nm1
	 extrd,u	$nm0,31,32,$hi1
	stw,ma		$nm0,8($tp)		; tp[j-1]

	ldo		-1($num),$num		; i--
	subi		0,$arrsz,$idx		; j=0
___
# Fetch bp[1] for the next outer iteration; the 64-bit build reads it
# in flipped word order (pointer advanced later).
$code.=<<___ if ($BN_SZ==4);
	fldws,ma	4($bp),${fbi}		; bp[1]
___
$code.=<<___ if ($BN_SZ==8);
	fldws		0($bp),${fbi}		; bp[1] in flipped word order
___
# Bridge into L\$outer: finish the i=0 column sums (including the
# final carries stored at tp[num]), then for each subsequent i the
# L\$inner loop accumulates ap[j]*bp[i] + np[j]*m + tp[j].  Note the
# fcnvxf/fadd/fcnvfx sequence: tp[0] is added to ap[0]*bp[i] in FP,
# yielding a 33-bit integer result (presumably to keep the carry with
# the product — NOTE(review) relies on quad FP convert semantics).
$code.=<<___;
	 flddx		$idx($ap),${fai}	; ap[0,1]
	 flddx		$idx($np),${fni}	; np[0,1]
	 fldws		8($xfer),${fti}R	; tp[0]
	addl		$hi0,$ab1,$ab1
	 extrd,u	$ab1,31,32,$hi0
	 extrd,u	$ab1,63,32,$ab1
	 ldo		8($idx),$idx		; j++++
	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1
	 fstws,mb	${fab0}L,-8($xfer)	; save high part
	stw		$nm1,-4($tp)		; tp[j-1]

	 fcpy,sgl	%fr0,${fti}L		; zero high part
	 fcpy,sgl	%fr0,${fab0}L
	addl		$hi1,$hi0,$hi0
	extrd,u		$hi0,31,32,$hi1
	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
	 fcnvxf,dbl,dbl	${fab0},${fab0}
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
	xmpyu		${fn0},${fab0}R,${fm0}
	ldo		`$LOCALS+32+4`($fp),$tp
L\$outer
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
	fstds		${fab0},-16($xfer)	; 33-bit value
	fstds		${fnm0},-8($xfer)
	 flddx		$idx($ap),${fai}	; ap[2]
	 flddx		$idx($np),${fni}	; np[2]
	 ldo		8($idx),$idx		; j++++
	ldd		-16($xfer),$ab0		; 33-bit value
	ldd		-8($xfer),$nm0
	ldw		0($xfer),$hi0		; high part

	 extrd,u	$ab0,31,32,$ti0		; carry bit
	 extrd,u	$ab0,63,32,$ab0
	fstds		${fab1},0($xfer)
	 addl		$ti0,$hi0,$hi0		; account carry bit
	fstds		${fnm1},8($xfer)
	 addl		$ab0,$nm0,$nm0		; low part is discarded
	ldw		0($tp),$ti1		; tp[1]
	 extrd,u	$nm0,31,32,$hi1
	ldd		0($xfer),$ab1
\f
L\$inner
	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	 ldd		8($xfer),$nm1
	fstds		${fab0},-16($xfer)
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	fstds		${fnm0},-8($xfer)
	ldw		4($tp),$ti0		; tp[j]
	 addl		$hi0,$ab1,$ab1
	fstds		${fab1},0($xfer)
	 addl		$ti1,$ab1,$ab1
	 extrd,u	$ab1,31,32,$hi0
	fstds		${fnm1},8($xfer)
	 extrd,u	$ab1,63,32,$ab1
	 addl		$hi1,$nm1,$nm1
	ldd		-16($xfer),$ab0
	 addl		$ab1,$nm1,$nm1
	ldd		-8($xfer),$nm0
	 extrd,u	$nm1,31,32,$hi1

	flddx		$idx($ap),${fai}	; ap[j,j+1]
	 addl		$hi0,$ab0,$ab0
	flddx		$idx($np),${fni}	; np[j,j+1]
	 addl		$ti0,$ab0,$ab0
	stw		$nm1,-4($tp)		; tp[j-1]
	 extrd,u	$ab0,31,32,$hi0
	ldw		8($tp),$ti1		; tp[j]
	 extrd,u	$ab0,63,32,$ab0
	 addl		$hi1,$nm0,$nm0
	 addl		$ab0,$nm0,$nm0
	ldd		0($xfer),$ab1
	 stw,ma		$nm0,8($tp)		; tp[j-1]
	addib,<>	8,$idx,L\$inner		; j++++
	 extrd,u	$nm0,31,32,$hi1

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	 ldd		8($xfer),$nm1
	fstds		${fab0},-16($xfer)
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	fstds		${fnm0},-8($xfer)
	ldw		4($tp),$ti0		; tp[j]
	 addl		$hi0,$ab1,$ab1
	fstds		${fab1},0($xfer)
	 addl		$ti1,$ab1,$ab1
	 extrd,u	$ab1,31,32,$hi0
	fstds		${fnm1},8($xfer)
	 extrd,u	$ab1,63,32,$ab1
	 addl		$hi1,$nm1,$nm1
	ldd		-16($xfer),$ab0
	 addl		$ab1,$nm1,$nm1
	ldd		-8($xfer),$nm0
	 extrd,u	$nm1,31,32,$hi1

	addl		$hi0,$ab0,$ab0
	 addl		$ti0,$ab0,$ab0
	 stw		$nm1,-4($tp)		; tp[j-1]
	 extrd,u	$ab0,31,32,$hi0
	ldw		8($tp),$ti1		; tp[j]
	 extrd,u	$ab0,63,32,$ab0
	 addl		$hi1,$nm0,$nm0
	ldd		0($xfer),$ab1
	 addl		$ab0,$nm0,$nm0
	ldd,mb		8($xfer),$nm1
	 extrd,u	$nm0,31,32,$hi1
	 stw,ma		$nm0,8($tp)		; tp[j-1]

	addib,=		-1,$num,L\$outerdone	; i--
	subi		0,$arrsz,$idx		; j=0
___
# Advance bp to bp[i] for the next outer iteration.  The 64-bit build
# walks the 32-bit halves in flipped word order: +12 after an even
# half, -4 after an odd one (addl,ev nullifies the second ldi).
$code.=<<___ if ($BN_SZ==4);
	fldws,ma	4($bp),${fbi}		; bp[i]
___
$code.=<<___ if ($BN_SZ==8);
	ldi		12,$ti0			; bp[i] in flipped word order
	addl,ev		%r0,$num,$num
	ldi		-4,$ti0
	addl		$ti0,$bp,$bp
	fldws		0($bp),${fbi}
___
# Outer-loop tail: fold in the final carries plus tp[j], seed the next
# iteration's ap[0]*bp[i] products and Montgomery factor m (via the
# same FP add-tp[0] trick as above), then loop back to L\$outer.
# L\$outerdone finalizes the last column sums and clears the borrow
# ahead of the final subtraction.
$code.=<<___;
	 flddx		$idx($ap),${fai}	; ap[0]
	addl		$hi0,$ab1,$ab1
	 flddx		$idx($np),${fni}	; np[0]
	 fldws		8($xfer),${fti}R	; tp[0]
	addl		$ti1,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1

	 ldo		8($idx),$idx		; j++++
	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
	ldw		4($tp),$ti0		; tp[j]

	addl		$hi1,$nm1,$nm1
	 fstws,mb	${fab0}L,-8($xfer)	; save high part
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1
	 fcpy,sgl	%fr0,${fti}L		; zero high part
	 fcpy,sgl	%fr0,${fab0}L
	stw		$nm1,-4($tp)		; tp[j-1]

	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
	 fcnvxf,dbl,dbl	${fab0},${fab0}
	addl		$hi1,$hi0,$hi0
	 fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	addl		$ti0,$hi0,$hi0
	extrd,u		$hi0,31,32,$hi1
	 fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)
	 xmpyu		${fn0},${fab0}R,${fm0}

	b		L\$outer
	ldo		`$LOCALS+32+4`($fp),$tp
\f
L\$outerdone
	addl		$hi0,$ab1,$ab1
	addl		$ti1,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1

	ldw		4($tp),$ti0		; tp[j]

	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1
	stw		$nm1,-4($tp)		; tp[j-1]

	addl		$hi1,$hi0,$hi0
	addl		$ti0,$hi0,$hi0
	extrd,u		$hi0,31,32,$hi1
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	ldo		`$LOCALS+32`($fp),$tp
	sub		%r0,%r0,%r0		; clear borrow
___
# Final reduction step 1: compute tp - np with borrow propagation,
# storing the difference at rp.  The 32-bit build defers to the
# word-at-a-time L\$sub_pa11 variant when rp is not 64-bit aligned.
$code.=<<___ if ($BN_SZ==4);
	ldws,ma		4($tp),$ti0
	extru,=		$rp,31,3,%r0		; is rp 64-bit aligned?
	b		L\$sub_pa11
	addl		$tp,$arrsz,$tp
L\$sub
	ldwx		$idx($np),$hi0
	subb		$ti0,$hi0,$hi1
	ldwx		$idx($tp),$ti0
	addib,<>	4,$idx,L\$sub
	stws,ma		$hi1,4($rp)

	subb		$ti0,%r0,$hi1
	ldo		-4($tp),$tp
___
# 64-bit build: tp holds 32-bit halves, so each doubleword is flipped
# back into native word order before subtracting whole 64-bit words.
$code.=<<___ if ($BN_SZ==8);
	ldd,ma		8($tp),$ti0
L\$sub
	ldd		$idx($np),$hi0
	shrpd		$ti0,$ti0,32,$ti0	; flip word order
	std		$ti0,-8($tp)		; save flipped value
	sub,db		$ti0,$hi0,$hi1
	ldd,ma		8($tp),$ti0
	addib,<>	8,$idx,L\$sub
	std,ma		$hi1,8($rp)

	extrd,u		$ti0,31,32,$ti0		; carry in flipped word order
	sub,db		$ti0,%r0,$hi1
	ldo		-8($tp),$tp
___
# Final reduction step 2: use $hi1 as a mask to select the copy source
# (tp, or the difference already written at rp), then copy the result
# out to rp while zeroing tp in passing (std,ma %r0).
$code.=<<___;
	and		$tp,$hi1,$ap
	andcm		$rp,$hi1,$bp
	or		$ap,$bp,$np

	sub		$rp,$arrsz,$rp		; rewind rp
	subi		0,$arrsz,$idx
	ldo		`$LOCALS+32`($fp),$tp
L\$copy
	ldd		$idx($np),$hi0
	std,ma		%r0,8($tp)
	addib,<>	8,$idx,.-8		; L\$copy
	std,ma		$hi0,8($rp)
___
537
538 if ($BN_SZ==4) {                                # PA-RISC 1.1 code-path
539 $ablo=$ab0;
540 $abhi=$ab1;
541 $nmlo0=$nm0;
542 $nmhi0=$nm1;
543 $nmlo1="%r9";
544 $nmhi1="%r8";
545
546 $code.=<<___;
547         b               L\$done
548         nop
549
550         .ALIGN          8
551 L\$parisc11
552         ldw             -16($xfer),$hi0
553         ldw             -12($xfer),$ablo
554         ldw             -8($xfer),$nmhi0
555         ldw             -4($xfer),$nmlo0
556
557          ldo            8($idx),$idx            ; j++++
558          add            $ablo,$nmlo0,$nmlo0     ; discarded
559          addc           %r0,$nmhi0,$hi1
560         ldw             0($xfer),$abhi
561         ldw             4($xfer),$ablo
562         nop
563 \f
564 L\$1st_pa11
565         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
566          ldw            8($xfer),$nmhi1
567         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
568          ldw            12($xfer),$nmlo1
569         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[0]
570         fstds           ${fab0},-16($xfer)
571         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
572         fstds           ${fnm0},-8($xfer)
573          add            $hi0,$ablo,$ablo
574         fstds           ${fab1},0($xfer)
575          addc           %r0,$abhi,$hi0
576         fstds           ${fnm1},8($xfer)
577          add            $ablo,$nmlo1,$nmlo1
578         ldw             -16($xfer),$abhi
579          addc           %r0,$nmhi1,$nmhi1
580         ldw             -12($xfer),$ablo
581          add            $hi1,$nmlo1,$nmlo1
582         ldw             -8($xfer),$nmhi0
583          addc           %r0,$nmhi1,$hi1
584         ldw             -4($xfer),$nmlo0
585
586          add            $hi0,$ablo,$ablo
587         flddx           $idx($ap),${fai}        ; ap[j,j+1]
588          addc           %r0,$abhi,$hi0
589         flddx           $idx($np),${fni}        ; np[j,j+1]
590          add            $ablo,$nmlo0,$nmlo0
591         stw             $nmlo1,-4($tp)          ; tp[j-1]
592          addc           %r0,$nmhi0,$nmhi0
593         ldw             0($xfer),$abhi
594          add            $hi1,$nmlo0,$nmlo0
595         ldw             4($xfer),$ablo
596          stws,ma        $nmlo0,8($tp)           ; tp[j-1]
597         addib,<>        8,$idx,L\$1st_pa11      ; j++++
598          addc           %r0,$nmhi0,$hi1
599
600         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
601          ldw            8($xfer),$nmhi1
602         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
603          ldw            12($xfer),$nmlo1
604         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j]*bp[0]
605         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j]*m
606         fstds           ${fab0},-16($xfer)
607         fstds           ${fnm0},-8($xfer)
608          add            $hi0,$ablo,$ablo
609         fstds           ${fab1},0($xfer)
610          addc           %r0,$abhi,$hi0
611         fstds           ${fnm1},8($xfer)
612          add            $ablo,$nmlo1,$nmlo1
613         ldw             -16($xfer),$abhi
614          addc           %r0,$nmhi1,$nmhi1
615         ldw             -12($xfer),$ablo
616          add            $hi1,$nmlo1,$nmlo1
617         ldw             -8($xfer),$nmhi0
618          addc           %r0,$nmhi1,$hi1
619         ldw             -4($xfer),$nmlo0
620
621          add            $hi0,$ablo,$ablo
622         stw             $nmlo1,-4($tp)          ; tp[j-1]
623          addc           %r0,$abhi,$hi0
624         ldw             0($xfer),$abhi
625          add            $ablo,$nmlo0,$nmlo0
626         ldw             4($xfer),$ablo
627          addc           %r0,$nmhi0,$nmhi0
628         ldws,mb         8($xfer),$nmhi1
629          add            $hi1,$nmlo0,$nmlo0
630         ldw             4($xfer),$nmlo1
631          addc           %r0,$nmhi0,$hi1
632         stws,ma         $nmlo0,8($tp)           ; tp[j-1]
633
634         ldo             -1($num),$num           ; i--
635         subi            0,$arrsz,$idx           ; j=0
636
637          fldws,ma       4($bp),${fbi}           ; bp[1]
638          flddx          $idx($ap),${fai}        ; ap[0,1]
639          flddx          $idx($np),${fni}        ; np[0,1]
640          fldws          8($xfer),${fti}R        ; tp[0]
641         add             $hi0,$ablo,$ablo
642         addc            %r0,$abhi,$hi0
643          ldo            8($idx),$idx            ; j++++
644          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[1]
645          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[1]
646         add             $hi1,$nmlo1,$nmlo1
647         addc            %r0,$nmhi1,$nmhi1
648         add             $ablo,$nmlo1,$nmlo1
649         addc            %r0,$nmhi1,$hi1
650          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
651         stw             $nmlo1,-4($tp)          ; tp[j-1]
652
653          fcpy,sgl       %fr0,${fti}L            ; zero high part
654          fcpy,sgl       %fr0,${fab0}L
655         add             $hi1,$hi0,$hi0
656         addc            %r0,%r0,$hi1
657          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
658          fcnvxf,dbl,dbl ${fab0},${fab0}
659         stw             $hi0,0($tp)
660         stw             $hi1,4($tp)
661
662         fadd,dbl        ${fti},${fab0},${fab0}  ; add tp[0]
663         fcnvfx,dbl,dbl  ${fab0},${fab0}         ; double -> 33-bit unsigned int
664         xmpyu           ${fn0},${fab0}R,${fm0}
665         ldo             `$LOCALS+32+4`($fp),$tp
666 L\$outer_pa11
667         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[0]*m
668         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[1]*m
669         fstds           ${fab0},-16($xfer)      ; 33-bit value
670         fstds           ${fnm0},-8($xfer)
671          flddx          $idx($ap),${fai}        ; ap[2,3]
672          flddx          $idx($np),${fni}        ; np[2,3]
673         ldw             -16($xfer),$abhi        ; carry bit actually
674          ldo            8($idx),$idx            ; j++++
675         ldw             -12($xfer),$ablo
676         ldw             -8($xfer),$nmhi0
677         ldw             -4($xfer),$nmlo0
678         ldw             0($xfer),$hi0           ; high part
679
680         fstds           ${fab1},0($xfer)
681          addl           $abhi,$hi0,$hi0         ; account carry bit
682         fstds           ${fnm1},8($xfer)
683          add            $ablo,$nmlo0,$nmlo0     ; discarded
684         ldw             0($tp),$ti1             ; tp[1]
685          addc           %r0,$nmhi0,$hi1
686         ldw             0($xfer),$abhi
687         ldw             4($xfer),$ablo
688 \f
689 L\$inner_pa11
690         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
691          ldw            8($xfer),$nmhi1
692         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
693          ldw            12($xfer),$nmlo1
694         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[i]
695         fstds           ${fab0},-16($xfer)
696         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
697         fstds           ${fnm0},-8($xfer)
698          add            $hi0,$ablo,$ablo
699         ldw             4($tp),$ti0             ; tp[j]
700          addc           %r0,$abhi,$abhi
701         fstds           ${fab1},0($xfer)
702          add            $ti1,$ablo,$ablo
703         fstds           ${fnm1},8($xfer)
704          addc           %r0,$abhi,$hi0
705         ldw             -16($xfer),$abhi
706          add            $ablo,$nmlo1,$nmlo1
707         ldw             -12($xfer),$ablo
708          addc           %r0,$nmhi1,$nmhi1
709         ldw             -8($xfer),$nmhi0
710          add            $hi1,$nmlo1,$nmlo1
711         ldw             -4($xfer),$nmlo0
712          addc           %r0,$nmhi1,$hi1
713
714         flddx           $idx($ap),${fai}        ; ap[j,j+1]
715          addl,nuv       $hi0,$ablo,$ablo
716          addi           1,$abhi,$abhi
717         flddx           $idx($np),${fni}        ; np[j,j+1]
718          add            $ti0,$ablo,$ablo
719         stw             $nmlo1,-4($tp)          ; tp[j-1]
720          addc           %r0,$abhi,$hi0
721         ldw             8($tp),$ti1             ; tp[j]
722          addl,nuv       $ablo,$nmlo0,$nmlo0
723          addi           1,$nmhi0,$nmhi0
724         ldw             0($xfer),$abhi
725          add            $hi1,$nmlo0,$nmlo0
726         ldw             4($xfer),$ablo
727          stws,ma        $nmlo0,8($tp)           ; tp[j-1]
728         addib,<>        8,$idx,L\$inner_pa11    ; j++++
729          addc           %r0,$nmhi0,$hi1
730
731         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
732          ldw            8($xfer),$nmhi1
733         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
734          ldw            12($xfer),$nmlo1
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
736         fstds           ${fab0},-16($xfer)
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
738         fstds           ${fnm0},-8($xfer)
739          add            $hi0,$ablo,$ablo
740         ldw             4($tp),$ti0             ; tp[j]
741          addc           %r0,$abhi,$abhi
742         fstds           ${fab1},0($xfer)
743          add            $ti1,$ablo,$ablo
744         fstds           ${fnm1},8($xfer)
745          addc           %r0,$abhi,$hi0
746         ldw             -16($xfer),$abhi
747          add            $ablo,$nmlo1,$nmlo1
748         ldw             -12($xfer),$ablo
749          addc           %r0,$nmhi1,$nmhi1
750         ldw             -8($xfer),$nmhi0
751          add            $hi1,$nmlo1,$nmlo1
752         ldw             -4($xfer),$nmlo0
753          addc           %r0,$nmhi1,$hi1
754
755         add             $hi0,$ablo,$ablo
756          stw            $nmlo1,-4($tp)          ; tp[j-1]
757         addc            %r0,$abhi,$abhi
758          add            $ti0,$ablo,$ablo
759         ldw             8($tp),$ti1             ; tp[j]
760          addc           %r0,$abhi,$hi0
761         ldw             0($xfer),$abhi
762          add            $ablo,$nmlo0,$nmlo0
763         ldw             4($xfer),$ablo
764          addc           %r0,$nmhi0,$nmhi0
765         ldws,mb         8($xfer),$nmhi1
766          add            $hi1,$nmlo0,$nmlo0
767         ldw             4($xfer),$nmlo1
768          addc           %r0,$nmhi0,$hi1
769          stws,ma        $nmlo0,8($tp)           ; tp[j-1]
770
771         addib,=         -1,$num,L\$outerdone_pa11; i--
772         subi            0,$arrsz,$idx           ; j=0
773
774          fldws,ma       4($bp),${fbi}           ; bp[i]
775          flddx          $idx($ap),${fai}        ; ap[0]
776         add             $hi0,$ablo,$ablo
777         addc            %r0,$abhi,$abhi
778          flddx          $idx($np),${fni}        ; np[0]
779          fldws          8($xfer),${fti}R        ; tp[0]
780         add             $ti1,$ablo,$ablo
781         addc            %r0,$abhi,$hi0
782
783          ldo            8($idx),$idx            ; j++++
784          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[i]
785          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[i]
786         ldw             4($tp),$ti0             ; tp[j]
787
788         add             $hi1,$nmlo1,$nmlo1
789         addc            %r0,$nmhi1,$nmhi1
790          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
791         add             $ablo,$nmlo1,$nmlo1
792         addc            %r0,$nmhi1,$hi1
793          fcpy,sgl       %fr0,${fti}L            ; zero high part
794          fcpy,sgl       %fr0,${fab0}L
795         stw             $nmlo1,-4($tp)          ; tp[j-1]
796
797          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
798          fcnvxf,dbl,dbl ${fab0},${fab0}
799         add             $hi1,$hi0,$hi0
800         addc            %r0,%r0,$hi1
801          fadd,dbl       ${fti},${fab0},${fab0}  ; add tp[0]
802         add             $ti0,$hi0,$hi0
803         addc            %r0,$hi1,$hi1
804          fcnvfx,dbl,dbl ${fab0},${fab0}         ; double -> 33-bit unsigned int
805         stw             $hi0,0($tp)
806         stw             $hi1,4($tp)
807          xmpyu          ${fn0},${fab0}R,${fm0}
808
809         b               L\$outer_pa11
810         ldo             `$LOCALS+32+4`($fp),$tp
811 \f
812 L\$outerdone_pa11
813         add             $hi0,$ablo,$ablo
814         addc            %r0,$abhi,$abhi
815         add             $ti1,$ablo,$ablo
816         addc            %r0,$abhi,$hi0
817
818         ldw             4($tp),$ti0             ; tp[j]
819
820         add             $hi1,$nmlo1,$nmlo1
821         addc            %r0,$nmhi1,$nmhi1
822         add             $ablo,$nmlo1,$nmlo1
823         addc            %r0,$nmhi1,$hi1
824         stw             $nmlo1,-4($tp)          ; tp[j-1]
825
826         add             $hi1,$hi0,$hi0
827         addc            %r0,%r0,$hi1
828         add             $ti0,$hi0,$hi0
829         addc            %r0,$hi1,$hi1
830         stw             $hi0,0($tp)
831         stw             $hi1,4($tp)
832
833         ldo             `$LOCALS+32+4`($fp),$tp
834         sub             %r0,%r0,%r0             ; clear borrow
835         ldw             -4($tp),$ti0
836         addl            $tp,$arrsz,$tp
837 L\$sub_pa11
838         ldwx            $idx($np),$hi0
839         subb            $ti0,$hi0,$hi1
840         ldwx            $idx($tp),$ti0
841         addib,<>        4,$idx,L\$sub_pa11
842         stws,ma         $hi1,4($rp)
843
844         subb            $ti0,%r0,$hi1
845         ldo             -4($tp),$tp
846         and             $tp,$hi1,$ap
847         andcm           $rp,$hi1,$bp
848         or              $ap,$bp,$np
849
850         sub             $rp,$arrsz,$rp          ; rewind rp
851         subi            0,$arrsz,$idx
852         ldo             `$LOCALS+32`($fp),$tp
853 L\$copy_pa11
854         ldwx            $idx($np),$hi0
855         stws,ma         %r0,4($tp)
856         addib,<>        4,$idx,L\$copy_pa11
857         stws,ma         $hi0,4($rp)     
858
859         nop                                     ; alignment
860 L\$done
861 ___
862 }
863 \f
# Common exit sequence for both code paths: set the return value
# %r28=1 ("handled"), pop the temporary tp[num+1] area off the stack,
# restore the return pointer %r2 and callee-saved registers %r4-%r10,
# and return via %r2.  Entering at L\$abort skips the %r28 store and
# the register restores (NOTE(review): presumably the early bail-out
# target from argument validation in the prologue, which is outside
# this hunk — confirm).  $POP/$POPMB/$FRAME/$SIZE_T are sized for
# 32- vs 64-bit builds earlier in the file.
$code.=<<___;
	ldi		1,%r28			; signal "handled"
	ldo		$FRAME($fp),%sp		; destroy tp[num+1]

	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2	; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
L\$abort
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND
	.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
883 \f
884 # Explicitly encode PA-RISC 2.0 instructions used in this module, so
885 # that it can be compiled with .LEVEL 1.0. It should be noted that I
886 # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
887 # directive...
888
# Encode ldd (load doubleword) as a raw .WORD so the module can be fed
# to an assembler running at .LEVEL 1.0.  Operand forms other than the
# two recognized ones are passed through verbatim.
my $ldd = sub {
  my ($mod,$args) = @_;
  my $orig = "ldd$mod\t$args";

    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {	# format 4: indexed
	my ($x,$b,$t) = ($1,$2,$3);
	my $insn = (0x03<<26)|($b<<21)|($x<<16)|(3<<6)|$t;
	sprintf "\t.WORD\t0x%08x\t; %s",$insn,$orig;
    } elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {	# format 5: short displacement
	my ($im,$b,$t) = ($1,$2,$3);
	my $insn = (0x03<<26)|($b<<21)|(1<<12)|(3<<6)|$t;
	$insn |= ($im&0xF)<<17;			# displacement, low bits
	$insn |= ($im&0x10)<<12;		# displacement, sign bit
	$insn |= 1<<5  if ($mod =~ /^,m/);	# base-modify completer
	$insn |= 1<<13 if ($mod =~ /^,mb/);	# modify-before variant
	sprintf "\t.WORD\t0x%08x\t; %s",$insn,$orig;
    } else {
	"\t".$orig;
    }
};
906
# Encode std (store doubleword), short-displacement form, as a raw
# .WORD for .LEVEL 1.0 assembly; anything else is emitted verbatim.
my $std = sub {
  my ($mod,$args) = @_;
  my $orig = "std$mod\t$args";

    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) {	# format 6: short displacement
	my ($src,$im,$b) = ($1,$2,$3);
	my $insn = (0x03<<26)|($b<<21)|($src<<16)|(1<<12)|(0xB<<6);
	$insn |= ($im&0xF)<<1;			# displacement, low bits
	$insn |= ($im&0x10)>>4;			# displacement, sign bit
	$insn |= 1<<5  if ($mod =~ /^,m/);	# base-modify completer
	$insn |= 1<<13 if ($mod =~ /^,mb/);	# modify-before variant
	sprintf "\t.WORD\t0x%08x\t; %s",$insn,$orig;
    } else {
	"\t".$orig;
    }
};
920
# Encode extrd (extract doubleword) as a raw .WORD.  Only the ",u"
# completer is used by this module; it is implicit in the encodings
# below.  Unrecognized operand forms pass through verbatim.
my $extrd = sub {
  my ($mod,$args) = @_;
  my $orig = "extrd$mod\t$args";

    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {	# format 15: fixed position
	my ($src,$pos,$nbits,$dst) = ($1,$2,$3,$4);
	my $insn = (0x36<<26)|($src<<21)|($dst<<16);
	my $len = 32-$nbits;
	$insn |= (($pos&0x20)<<6)|(($pos&0x1f)<<5);	# encode pos
	$insn |= (($len&0x20)<<7)|($len&0x1f);		# encode len
	sprintf "\t.WORD\t0x%08x\t; %s",$insn,$orig;
    } elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {	# format 12: %sar-relative
	my ($src,$nbits,$dst) = ($1,$2,$3);
	my $insn = (0x34<<26)|($src<<21)|($dst<<16)|(2<<11)|(1<<9);
	my $len = 32-$nbits;
	$insn |= (($len&0x20)<<3)|($len&0x1f);		# encode len
	$insn |= 1<<13 if ($mod =~ /,\**=/);		# "=" condition completer
	sprintf "\t.WORD\t0x%08x\t; %s",$insn,$orig;
    } else {
	"\t".$orig;
    }
};
942
# Encode shrpd (shift-right pair doubleword), fixed shift amount, as a
# raw .WORD; other operand forms are emitted verbatim.
my $shrpd = sub {
  my ($mod,$args) = @_;
  my $orig = "shrpd$mod\t$args";

    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) {	# format 14
	my ($lo,$hi,$sa,$dst) = ($1,$2,$3,$4);
	my $insn = (0x34<<26)|($hi<<21)|($lo<<16)|(1<<10)|$dst;
	my $cpos = 63-$sa;
	$insn |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);	# encode shift amount
	sprintf "\t.WORD\t0x%08x\t; %s",$insn,$orig;
    } else {
	"\t".$orig;
    }
};
955
# Encode the ",db" variant of sub as a raw .WORD (the e1, e2 and d
# bits set on the base sub encoding); every other sub form is left for
# the assembler to handle.
my $sub = sub {
  my ($mod,$args) = @_;
  my $orig = "sub$mod\t$args";

    if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
	my ($r1,$r2,$t) = ($1,$2,$3);
	my $insn = (0x02<<26)|($r2<<21)|($r1<<16)|$t;
	$insn |= (1<<10)|(1<<8)|(1<<5);		# e1, e2, d
	sprintf "\t.WORD\t0x%08x\t; %s",$insn,$orig;
    } else {
	"\t".$orig;
    }
};
969
# Dispatch one parsed instruction: if an encoder closure with the
# mnemonic's name exists above (e.g. $ldd), let it produce the .WORD;
# otherwise reassemble the instruction text unchanged.
sub assemble {
  my ($mnemonic,$mod,$args) = @_;
  my $handler = eval("\$$mnemonic");	# look up the lexical by name

    return $handler->($mod,$args) if (ref($handler) eq 'CODE');
    return "\t$mnemonic$mod\t$args";
}
976
# Post-process and emit the accumulated code, line by line:
#  - expand `...` spans into their evaluated constant values;
#  - in 64-bit mode flip the L/R half-register suffix on xmpyu
#    operands (flip word order, per the original note);
#  - in 32-bit mode run each instruction through assemble() so that
#    PA-RISC 2.0 instructions become explicit .WORDs.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;
	# flip word order in 64-bit mode...
	s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
	# assemble 2.0 instructions in 32-bit mode...
	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);

	print $_,"\n";
}
# Check the close: buffered-write errors (e.g. full disk) only surface
# here, and an unchecked close would silently truncate the output.
close STDOUT or die "error closing STDOUT: $!";