8aa94e8511c9f431db6893a99a5a9f797d99d5cf
[openssl.git] / crypto / bn / asm / parisc-mont.pl
1 #! /usr/bin/env perl
2 # Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # On PA-7100LC this module performs ~90-50% better, less for longer
18 # keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
19 # that compiler utilized xmpyu instruction to perform 32x32=64-bit
20 # multiplication, which in turn means that "baseline" performance was
21 # optimal in respect to instruction set capabilities. Fair comparison
22 # with vendor compiler is problematic, because OpenSSL doesn't define
23 # BN_LLONG [presumably] for historical reasons, which drives compiler
24 # toward 4 times 16x16=32-bit multiplications [plus complementary
25 # shifts and additions] instead. This means that you should observe
26 # several times improvement over code generated by vendor compiler
27 # for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
28 # improvement coefficient was never collected on PA-7100LC, or any
29 # other 1.1 CPU, because I don't have access to such machine with
30 # vendor compiler. But to give you a taste, PA-RISC 1.1 code path
31 # reportedly outperformed code generated by cc +DA1.1 +O3 by factor
32 # of ~5x on PA-8600.
33 #
34 # On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
35 # reportedly ~2x faster than vendor compiler generated code [according
36 # to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
37 # this implementation is actually 32-bit one, in the sense that it
38 # operates on 32-bit values. But pa-risc2[W].s operates on arrays of
39 # 64-bit BN_LONGs... How do they interoperate then? No problem. This
40 # module picks halves of 64-bit values in reverse order and pretends
41 # they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
42 # 64-bit code such as pa-risc2[W].s then? Well, the thing is that
43 # 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
44 # i.e. there is no "wider" multiplication like on most other 64-bit
45 # platforms. This means that even being effectively 32-bit, this
46 # implementation performs "64-bit" computational task in same amount
47 # of arithmetic operations, most notably multiplications. It requires
48 # more memory references, most notably to tp[num], but this doesn't
49 # seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
50 # 2.0 code path provides virtually same performance as pa-risc2[W].s:
51 # it's ~10% better for shortest key length and ~10% worse for longest
52 # one.
53 #
54 # In case it wasn't clear. The module has two distinct code paths:
55 # PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
56 # additions and 64-bit integer loads, not to mention specific
57 # instruction scheduling. In 64-bit build naturally only 2.0 code path
58 # is assembled. In 32-bit application context both code paths are
59 # assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
60 # is taken automatically. Also, in 32-bit build the module imposes a
61 # couple of limitations: vector lengths have to be even and vector
62 # addresses have to be 64-bit aligned. Normally neither is a problem:
63 # most common key lengths are even and vectors are commonly malloc-ed,
64 # which ensures alignment.
65 #
66 # Special thanks to polarhome.com for providing HP-UX account on
67 # PA-RISC 1.1 machine, and to correspondent who chose to remain
68 # anonymous for testing the code on PA-RISC 2.0 machine.
69 \f
70 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;  # $dir: leading part of $0 up to and including the last / or \
71 # Command line: <flavour> [<output file>]
72 $flavour = shift;                       # matched against /64/ further down to pick the 64-bit parameters
73 $output = shift;                        # name of the file that receives everything printed to STDOUT
74 # Redirect STDOUT only when an output file was named, and fail loudly instead of silently printing elsewhere.
75 open STDOUT,">",$output or die "can't open $output: $!" if (defined $output && length $output);
76
77 if ($flavour =~ /64/) {                 # flavour names a 64-bit target
78         $LEVEL          ="2.0W";        # argument of the .LEVEL directive
79         $SIZE_T         =8;             # bytes per register save slot
80         $FRAME_MARKER   =80;            # frame marker size, part of $FRAME
81         $SAVED_RP       =16;            # %r2 is stored at -$SAVED_RP(%sp) in the prologue
82         $PUSH           ="std";         # store mnemonic
83         $PUSHMA         ="std,ma";      # store-and-modify mnemonic (allocates the frame)
84         $POP            ="ldd";         # load mnemonic
85         $POPMB          ="ldd,mb";      # load-and-modify mnemonic
86         $BN_SZ          =$SIZE_T;       # selects the BN_SZ==8 code fragments
87 } else {                                # any other flavour: 32-bit parameters
88         $LEVEL          ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
89         $SIZE_T         =4;
90         $FRAME_MARKER   =48;
91         $SAVED_RP       =20;
92         $PUSH           ="stw";
93         $PUSHMA         ="stwm";
94         $POP            ="ldw";
95         $POPMB          ="ldwm";
96         $BN_SZ          =$SIZE_T;       # BN_SZ==4 fragments unless overridden just below
97         if (open(my $conf,"<","${dir}../../opensslconf.h")) {   # header missing/unreadable: keep the defaults above
98             while(my $line=<$conf>) {
99                 if ($line =~ m/#\s*define\s+SIXTY_FOUR_BIT/) {  # any SIXTY_FOUR_BIT* define
100                     $BN_SZ=8;           # emit the BN_SZ==8 fragments instead
101                     $LEVEL="2.0";       # and assemble at .LEVEL 2.0
102                     last;               # first match is enough
103                 }
104             }
105             close $conf;
106         }
107 }
108
109 $FRAME  = $FRAME_MARKER + 8*$SIZE_T;    # frame marker plus 8 register save slots
110                                         #                [+ argument transfer]
111 $LOCALS = $FRAME - $FRAME_MARKER;       # base offset (from $fp) used for the transfer area and tp[]
112 $FRAME  = $FRAME + 32;                  # plus 32 bytes of local variables
113 # ---- general registers used by the generated code ----
114 $tp  = '%r31';                          # walks the temporary vector tp[]
115 $ti1 = '%r29';                          # scratch: words loaded from tp[]
116 $ti0 = '%r28';                          # scratch: words loaded from tp[]; %r28 also gets 0 for "unhandled"
117 # rp/ap/bp/np are the four arguments declared as GR in the .EXPORT line
118 $rp  = '%r26';
119 $ap  = '%r25';
120 $bp  = '%r24';
121 $np  = '%r23';
122 $n0  = '%r22';  # passed through stack in 32-bit
123 $num = '%r21';  # passed through stack in 32-bit
124 $idx = '%r20';                          # running index j, counts up towards zero
125 $arrsz = '%r19';                        # 4*$num, set by sh2addl
126 # integer copies of the 64-bit products read back from the transfer area
127 $nm1 = '%r7';
128 $nm0 = '%r6';
129 $ab1 = '%r5';
130 $ab0 = '%r4';
131 # frame pointer and carry/high-word accumulators
132 $fp  = '%r3';
133 $hi1 = '%r2';
134 $hi0 = '%r1';
135
136 $xfer = $n0;    # accommodates [-16..15] offset in fld[dw]s
137 # ---- floating-point registers feeding xmpyu ----
138 $fti = $fm0 = '%fr4';                   # m (product with n0); same register also holds tp[0]
139 $fbi = '%fr5L';                         # bp[i]
140 $fn0 = '%fr5R';                         # n0
141 ($fai,$fab0,$fab1) = ('%fr6','%fr7','%fr8');    # ap[j,j+1] and the two ap*bp products
142 ($fni,$fnm0,$fnm1) = ('%fr9','%fr10','%fr11');  # np[j,j+1] and the two np*m products
143
144 $code=<<___;
145         .LEVEL  $LEVEL
146         .SPACE  \$TEXT\$
147         .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
148
149         .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
150         .ALIGN  64
151 bn_mul_mont
152         .PROC
153         .CALLINFO       FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
154         .ENTRY
155         $PUSH   %r2,-$SAVED_RP(%sp)             ; standard prologue
156         $PUSHMA %r3,$FRAME(%sp)
157         $PUSH   %r4,`-$FRAME+1*$SIZE_T`(%sp)
158         $PUSH   %r5,`-$FRAME+2*$SIZE_T`(%sp)
159         $PUSH   %r6,`-$FRAME+3*$SIZE_T`(%sp)
160         $PUSH   %r7,`-$FRAME+4*$SIZE_T`(%sp)
161         $PUSH   %r8,`-$FRAME+5*$SIZE_T`(%sp)
162         $PUSH   %r9,`-$FRAME+6*$SIZE_T`(%sp)
163         $PUSH   %r10,`-$FRAME+7*$SIZE_T`(%sp)
164         ldo     -$FRAME(%sp),$fp
165 ___
166 $code.=<<___ if ($SIZE_T==4);
167         ldw     `-$FRAME_MARKER-4`($fp),$n0
168         ldw     `-$FRAME_MARKER-8`($fp),$num
169         nop
170         nop                                     ; alignment
171 ___
172 $code.=<<___ if ($BN_SZ==4);
173         comiclr,<=      6,$num,%r0              ; are vectors long enough?
174         b               L\$abort
175         ldi             0,%r28                  ; signal "unhandled"
176         add,ev          %r0,$num,$num           ; is $num even?
177         b               L\$abort
178         nop
179         or              $ap,$np,$ti1
180         extru,=         $ti1,31,3,%r0           ; are ap and np 64-bit aligned?
181         b               L\$abort
182         nop
183         nop                                     ; alignment
184         nop
185
186         fldws           0($n0),${fn0}
187         fldws,ma        4($bp),${fbi}           ; bp[0]
188 ___
189 $code.=<<___ if ($BN_SZ==8);
190         comib,>         3,$num,L\$abort         ; are vectors long enough?
191         ldi             0,%r28                  ; signal "unhandled"
192         addl            $num,$num,$num          ; I operate on 32-bit values
193
194         fldws           4($n0),${fn0}           ; only low part of n0
195         fldws           4($bp),${fbi}           ; bp[0] in flipped word order
196 ___
197 $code.=<<___;
198         fldds           0($ap),${fai}           ; ap[0,1]
199         fldds           0($np),${fni}           ; np[0,1]
200
201         sh2addl         $num,%r0,$arrsz
202         ldi             31,$hi0
203         ldo             36($arrsz),$hi1         ; space for tp[num+1]
204         andcm           $hi1,$hi0,$hi1          ; align
205         addl            $hi1,%sp,%sp
206         $PUSH           $fp,-$SIZE_T(%sp)
207
208         ldo             `$LOCALS+16`($fp),$xfer
209         ldo             `$LOCALS+32+4`($fp),$tp
210
211         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[0]*bp[0]
212         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[1]*bp[0]
213         xmpyu           ${fn0},${fab0}R,${fm0}
214
215         addl            $arrsz,$ap,$ap          ; point at the end
216         addl            $arrsz,$np,$np
217         subi            0,$arrsz,$idx           ; j=0
218         ldo             8($idx),$idx            ; j++++
219
220         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[0]*m
221         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[1]*m
222         fstds           ${fab0},-16($xfer)
223         fstds           ${fnm0},-8($xfer)
224         fstds           ${fab1},0($xfer)
225         fstds           ${fnm1},8($xfer)
226          flddx          $idx($ap),${fai}        ; ap[2,3]
227          flddx          $idx($np),${fni}        ; np[2,3]
228 ___
229 $code.=<<___ if ($BN_SZ==4);
230         mtctl           $hi0,%cr11              ; $hi0 still holds 31
231         extrd,u,*=      $hi0,%sar,1,$hi0        ; executes on PA-RISC 1.0
232         b               L\$parisc11
233         nop
234 ___
235 $code.=<<___;                                   # PA-RISC 2.0 code-path
236         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
237         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
238         ldd             -16($xfer),$ab0
239         fstds           ${fab0},-16($xfer)
240
241         extrd,u         $ab0,31,32,$hi0
242         extrd,u         $ab0,63,32,$ab0
243         ldd             -8($xfer),$nm0
244         fstds           ${fnm0},-8($xfer)
245          ldo            8($idx),$idx            ; j++++
246          addl           $ab0,$nm0,$nm0          ; low part is discarded
247          extrd,u        $nm0,31,32,$hi1
248 \f
249 L\$1st
250         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[0]
251         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
252         ldd             0($xfer),$ab1
253         fstds           ${fab1},0($xfer)
254          addl           $hi0,$ab1,$ab1
255          extrd,u        $ab1,31,32,$hi0
256         ldd             8($xfer),$nm1
257         fstds           ${fnm1},8($xfer)
258          extrd,u        $ab1,63,32,$ab1
259          addl           $hi1,$nm1,$nm1
260         flddx           $idx($ap),${fai}        ; ap[j,j+1]
261         flddx           $idx($np),${fni}        ; np[j,j+1]
262          addl           $ab1,$nm1,$nm1
263          extrd,u        $nm1,31,32,$hi1
264
265         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
266         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
267         ldd             -16($xfer),$ab0
268         fstds           ${fab0},-16($xfer)
269          addl           $hi0,$ab0,$ab0
270          extrd,u        $ab0,31,32,$hi0
271         ldd             -8($xfer),$nm0
272         fstds           ${fnm0},-8($xfer)
273          extrd,u        $ab0,63,32,$ab0
274          addl           $hi1,$nm0,$nm0
275         stw             $nm1,-4($tp)            ; tp[j-1]
276          addl           $ab0,$nm0,$nm0
277          stw,ma         $nm0,8($tp)             ; tp[j-1]
278         addib,<>        8,$idx,L\$1st           ; j++++
279          extrd,u        $nm0,31,32,$hi1
280
281         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j]*bp[0]
282         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j]*m
283         ldd             0($xfer),$ab1
284         fstds           ${fab1},0($xfer)
285          addl           $hi0,$ab1,$ab1
286          extrd,u        $ab1,31,32,$hi0
287         ldd             8($xfer),$nm1
288         fstds           ${fnm1},8($xfer)
289          extrd,u        $ab1,63,32,$ab1
290          addl           $hi1,$nm1,$nm1
291         ldd             -16($xfer),$ab0
292          addl           $ab1,$nm1,$nm1
293         ldd             -8($xfer),$nm0
294          extrd,u        $nm1,31,32,$hi1
295
296          addl           $hi0,$ab0,$ab0
297          extrd,u        $ab0,31,32,$hi0
298         stw             $nm1,-4($tp)            ; tp[j-1]
299          extrd,u        $ab0,63,32,$ab0
300          addl           $hi1,$nm0,$nm0
301         ldd             0($xfer),$ab1
302          addl           $ab0,$nm0,$nm0
303         ldd,mb          8($xfer),$nm1
304          extrd,u        $nm0,31,32,$hi1
305         stw,ma          $nm0,8($tp)             ; tp[j-1]
306
307         ldo             -1($num),$num           ; i--
308         subi            0,$arrsz,$idx           ; j=0
309 ___
310 $code.=<<___ if ($BN_SZ==4);
311         fldws,ma        4($bp),${fbi}           ; bp[1]
312 ___
313 $code.=<<___ if ($BN_SZ==8);
314         fldws           0($bp),${fbi}           ; bp[1] in flipped word order
315 ___
316 $code.=<<___;
317          flddx          $idx($ap),${fai}        ; ap[0,1]
318          flddx          $idx($np),${fni}        ; np[0,1]
319          fldws          8($xfer),${fti}R        ; tp[0]
320         addl            $hi0,$ab1,$ab1
321          extrd,u        $ab1,31,32,$hi0
322          extrd,u        $ab1,63,32,$ab1
323          ldo            8($idx),$idx            ; j++++
324          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[1]
325          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[1]
326         addl            $hi1,$nm1,$nm1
327         addl            $ab1,$nm1,$nm1
328         extrd,u         $nm1,31,32,$hi1
329          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
330         stw             $nm1,-4($tp)            ; tp[j-1]
331
332          fcpy,sgl       %fr0,${fti}L            ; zero high part
333          fcpy,sgl       %fr0,${fab0}L
334         addl            $hi1,$hi0,$hi0
335         extrd,u         $hi0,31,32,$hi1
336          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
337          fcnvxf,dbl,dbl ${fab0},${fab0}
338         stw             $hi0,0($tp)
339         stw             $hi1,4($tp)
340
341         fadd,dbl        ${fti},${fab0},${fab0}  ; add tp[0]
342         fcnvfx,dbl,dbl  ${fab0},${fab0}         ; double -> 33-bit unsigned int
343         xmpyu           ${fn0},${fab0}R,${fm0}
344         ldo             `$LOCALS+32+4`($fp),$tp
345 L\$outer
346         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[0]*m
347         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[1]*m
348         fstds           ${fab0},-16($xfer)      ; 33-bit value
349         fstds           ${fnm0},-8($xfer)
350          flddx          $idx($ap),${fai}        ; ap[2]
351          flddx          $idx($np),${fni}        ; np[2]
352          ldo            8($idx),$idx            ; j++++
353         ldd             -16($xfer),$ab0         ; 33-bit value
354         ldd             -8($xfer),$nm0
355         ldw             0($xfer),$hi0           ; high part
356
357         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
358         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
359          extrd,u        $ab0,31,32,$ti0         ; carry bit
360          extrd,u        $ab0,63,32,$ab0
361         fstds           ${fab1},0($xfer)
362          addl           $ti0,$hi0,$hi0          ; account carry bit
363         fstds           ${fnm1},8($xfer)
364          addl           $ab0,$nm0,$nm0          ; low part is discarded
365         ldw             0($tp),$ti1             ; tp[1]
366          extrd,u        $nm0,31,32,$hi1
367         fstds           ${fab0},-16($xfer)
368         fstds           ${fnm0},-8($xfer)
369 \f
370 L\$inner
371         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[i]
372         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
373         ldd             0($xfer),$ab1
374         fstds           ${fab1},0($xfer)
375          addl           $hi0,$ti1,$ti1
376          addl           $ti1,$ab1,$ab1
377         ldd             8($xfer),$nm1
378         fstds           ${fnm1},8($xfer)
379          extrd,u        $ab1,31,32,$hi0
380          extrd,u        $ab1,63,32,$ab1
381         flddx           $idx($ap),${fai}        ; ap[j,j+1]
382         flddx           $idx($np),${fni}        ; np[j,j+1]
383          addl           $hi1,$nm1,$nm1
384          addl           $ab1,$nm1,$nm1
385         ldw             4($tp),$ti0             ; tp[j]
386         stw             $nm1,-4($tp)            ; tp[j-1]
387
388         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
389         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
390         ldd             -16($xfer),$ab0
391         fstds           ${fab0},-16($xfer)
392          addl           $hi0,$ti0,$ti0
393          addl           $ti0,$ab0,$ab0
394         ldd             -8($xfer),$nm0
395         fstds           ${fnm0},-8($xfer)
396          extrd,u        $ab0,31,32,$hi0
397          extrd,u        $nm1,31,32,$hi1
398         ldw             8($tp),$ti1             ; tp[j]
399          extrd,u        $ab0,63,32,$ab0
400          addl           $hi1,$nm0,$nm0
401          addl           $ab0,$nm0,$nm0
402          stw,ma         $nm0,8($tp)             ; tp[j-1]
403         addib,<>        8,$idx,L\$inner         ; j++++
404          extrd,u        $nm0,31,32,$hi1
405
406         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j]*bp[i]
407         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j]*m
408         ldd             0($xfer),$ab1
409         fstds           ${fab1},0($xfer)
410          addl           $hi0,$ti1,$ti1
411          addl           $ti1,$ab1,$ab1
412         ldd             8($xfer),$nm1
413         fstds           ${fnm1},8($xfer)
414          extrd,u        $ab1,31,32,$hi0
415          extrd,u        $ab1,63,32,$ab1
416         ldw             4($tp),$ti0             ; tp[j]
417          addl           $hi1,$nm1,$nm1
418          addl           $ab1,$nm1,$nm1
419         ldd             -16($xfer),$ab0
420         ldd             -8($xfer),$nm0
421          extrd,u        $nm1,31,32,$hi1
422
423         addl            $hi0,$ab0,$ab0
424          addl           $ti0,$ab0,$ab0
425          stw            $nm1,-4($tp)            ; tp[j-1]
426          extrd,u        $ab0,31,32,$hi0
427         ldw             8($tp),$ti1             ; tp[j]
428          extrd,u        $ab0,63,32,$ab0
429          addl           $hi1,$nm0,$nm0
430         ldd             0($xfer),$ab1
431          addl           $ab0,$nm0,$nm0
432         ldd,mb          8($xfer),$nm1
433          extrd,u        $nm0,31,32,$hi1
434          stw,ma         $nm0,8($tp)             ; tp[j-1]
435
436         addib,=         -1,$num,L\$outerdone    ; i--
437         subi            0,$arrsz,$idx           ; j=0
438 ___
439 $code.=<<___ if ($BN_SZ==4);
440         fldws,ma        4($bp),${fbi}           ; bp[i]
441 ___
442 $code.=<<___ if ($BN_SZ==8);
443         ldi             12,$ti0                 ; bp[i] in flipped word order
444         addl,ev         %r0,$num,$num
445         ldi             -4,$ti0
446         addl            $ti0,$bp,$bp
447         fldws           0($bp),${fbi}
448 ___
449 $code.=<<___;
450          flddx          $idx($ap),${fai}        ; ap[0]
451         addl            $hi0,$ab1,$ab1
452          flddx          $idx($np),${fni}        ; np[0]
453          fldws          8($xfer),${fti}R        ; tp[0]
454         addl            $ti1,$ab1,$ab1
455         extrd,u         $ab1,31,32,$hi0
456         extrd,u         $ab1,63,32,$ab1
457
458          ldo            8($idx),$idx            ; j++++
459          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[i]
460          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[i]
461         ldw             4($tp),$ti0             ; tp[j]
462
463         addl            $hi1,$nm1,$nm1
464          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
465         addl            $ab1,$nm1,$nm1
466         extrd,u         $nm1,31,32,$hi1
467          fcpy,sgl       %fr0,${fti}L            ; zero high part
468          fcpy,sgl       %fr0,${fab0}L
469         stw             $nm1,-4($tp)            ; tp[j-1]
470
471          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
472          fcnvxf,dbl,dbl ${fab0},${fab0}
473         addl            $hi1,$hi0,$hi0
474          fadd,dbl       ${fti},${fab0},${fab0}  ; add tp[0]
475         addl            $ti0,$hi0,$hi0
476         extrd,u         $hi0,31,32,$hi1
477          fcnvfx,dbl,dbl ${fab0},${fab0}         ; double -> 33-bit unsigned int
478         stw             $hi0,0($tp)
479         stw             $hi1,4($tp)
480          xmpyu          ${fn0},${fab0}R,${fm0}
481
482         b               L\$outer
483         ldo             `$LOCALS+32+4`($fp),$tp
484 \f
485 L\$outerdone
486         addl            $hi0,$ab1,$ab1
487         addl            $ti1,$ab1,$ab1
488         extrd,u         $ab1,31,32,$hi0
489         extrd,u         $ab1,63,32,$ab1
490
491         ldw             4($tp),$ti0             ; tp[j]
492
493         addl            $hi1,$nm1,$nm1
494         addl            $ab1,$nm1,$nm1
495         extrd,u         $nm1,31,32,$hi1
496         stw             $nm1,-4($tp)            ; tp[j-1]
497
498         addl            $hi1,$hi0,$hi0
499         addl            $ti0,$hi0,$hi0
500         extrd,u         $hi0,31,32,$hi1
501         stw             $hi0,0($tp)
502         stw             $hi1,4($tp)
503
504         ldo             `$LOCALS+32`($fp),$tp
505         sub             %r0,%r0,%r0             ; clear borrow
506 ___
507 $code.=<<___ if ($BN_SZ==4);
508         ldws,ma         4($tp),$ti0
509         extru,=         $rp,31,3,%r0            ; is rp 64-bit aligned?
510         b               L\$sub_pa11
511         addl            $tp,$arrsz,$tp
512 L\$sub
513         ldwx            $idx($np),$hi0
514         subb            $ti0,$hi0,$hi1
515         ldwx            $idx($tp),$ti0
516         addib,<>        4,$idx,L\$sub
517         stws,ma         $hi1,4($rp)
518
519         subb            $ti0,%r0,$hi1
520         ldo             -4($tp),$tp
521 ___
522 $code.=<<___ if ($BN_SZ==8);
523         ldd,ma          8($tp),$ti0
524 L\$sub
525         ldd             $idx($np),$hi0
526         shrpd           $ti0,$ti0,32,$ti0       ; flip word order
527         std             $ti0,-8($tp)            ; save flipped value
528         sub,db          $ti0,$hi0,$hi1
529         ldd,ma          8($tp),$ti0
530         addib,<>        8,$idx,L\$sub
531         std,ma          $hi1,8($rp)
532
533         extrd,u         $ti0,31,32,$ti0         ; carry in flipped word order
534         sub,db          $ti0,%r0,$hi1
535         ldo             -8($tp),$tp
536 ___
537 $code.=<<___;
538         and             $tp,$hi1,$ap
539         andcm           $rp,$hi1,$bp
540         or              $ap,$bp,$np
541
542         sub             $rp,$arrsz,$rp          ; rewind rp
543         subi            0,$arrsz,$idx
544         ldo             `$LOCALS+32`($fp),$tp
545 L\$copy
546         ldd             $idx($np),$hi0
547         std,ma          %r0,8($tp)
548         addib,<>        8,$idx,.-8              ; L\$copy
549         std,ma          $hi0,8($rp)     
550 ___
551
552 if ($BN_SZ==4) {                                # PA-RISC 1.1 code-path
553 $ablo=$ab0;
554 $abhi=$ab1;
555 $nmlo0=$nm0;
556 $nmhi0=$nm1;
557 $nmlo1="%r9";
558 $nmhi1="%r8";
559
560 $code.=<<___;
561         b               L\$done
562         nop
563
564         .ALIGN          8
565 L\$parisc11
566         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
567         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
568         ldw             -12($xfer),$ablo
569         ldw             -16($xfer),$hi0
570         ldw             -4($xfer),$nmlo0
571         ldw             -8($xfer),$nmhi0
572         fstds           ${fab0},-16($xfer)
573         fstds           ${fnm0},-8($xfer)
574
575          ldo            8($idx),$idx            ; j++++
576          add            $ablo,$nmlo0,$nmlo0     ; discarded
577          addc           %r0,$nmhi0,$hi1
578         ldw             4($xfer),$ablo
579         ldw             0($xfer),$abhi
580         nop
581 \f
582 L\$1st_pa11
583         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[0]
584         flddx           $idx($ap),${fai}        ; ap[j,j+1]
585         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
586         flddx           $idx($np),${fni}        ; np[j,j+1]
587          add            $hi0,$ablo,$ablo
588         ldw             12($xfer),$nmlo1
589          addc           %r0,$abhi,$hi0
590         ldw             8($xfer),$nmhi1
591          add            $ablo,$nmlo1,$nmlo1
592         fstds           ${fab1},0($xfer)
593          addc           %r0,$nmhi1,$nmhi1
594         fstds           ${fnm1},8($xfer)
595          add            $hi1,$nmlo1,$nmlo1
596         ldw             -12($xfer),$ablo
597          addc           %r0,$nmhi1,$hi1
598         ldw             -16($xfer),$abhi
599
600         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
601         ldw             -4($xfer),$nmlo0
602         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
603         ldw             -8($xfer),$nmhi0
604          add            $hi0,$ablo,$ablo
605         stw             $nmlo1,-4($tp)          ; tp[j-1]
606          addc           %r0,$abhi,$hi0
607         fstds           ${fab0},-16($xfer)
608          add            $ablo,$nmlo0,$nmlo0
609         fstds           ${fnm0},-8($xfer)
610          addc           %r0,$nmhi0,$nmhi0
611         ldw             0($xfer),$abhi
612          add            $hi1,$nmlo0,$nmlo0
613         ldw             4($xfer),$ablo
614          stws,ma        $nmlo0,8($tp)           ; tp[j-1]
615         addib,<>        8,$idx,L\$1st_pa11      ; j++++
616          addc           %r0,$nmhi0,$hi1
617
618          ldw            8($xfer),$nmhi1
619          ldw            12($xfer),$nmlo1
620         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j]*bp[0]
621         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j]*m
622          add            $hi0,$ablo,$ablo
623         fstds           ${fab1},0($xfer)
624          addc           %r0,$abhi,$hi0
625         fstds           ${fnm1},8($xfer)
626          add            $ablo,$nmlo1,$nmlo1
627         ldw             -16($xfer),$abhi
628          addc           %r0,$nmhi1,$nmhi1
629         ldw             -12($xfer),$ablo
630          add            $hi1,$nmlo1,$nmlo1
631         ldw             -8($xfer),$nmhi0
632          addc           %r0,$nmhi1,$hi1
633         ldw             -4($xfer),$nmlo0
634
635          add            $hi0,$ablo,$ablo
636         stw             $nmlo1,-4($tp)          ; tp[j-1]
637          addc           %r0,$abhi,$hi0
638         ldw             0($xfer),$abhi
639          add            $ablo,$nmlo0,$nmlo0
640         ldw             4($xfer),$ablo
641          addc           %r0,$nmhi0,$nmhi0
642         ldws,mb         8($xfer),$nmhi1
643          add            $hi1,$nmlo0,$nmlo0
644         ldw             4($xfer),$nmlo1
645          addc           %r0,$nmhi0,$hi1
646         stws,ma         $nmlo0,8($tp)           ; tp[j-1]
647
648         ldo             -1($num),$num           ; i--
649         subi            0,$arrsz,$idx           ; j=0
650
651          fldws,ma       4($bp),${fbi}           ; bp[1]
652          flddx          $idx($ap),${fai}        ; ap[0,1]
653          flddx          $idx($np),${fni}        ; np[0,1]
654          fldws          8($xfer),${fti}R        ; tp[0]
655         add             $hi0,$ablo,$ablo
656         addc            %r0,$abhi,$hi0
657          ldo            8($idx),$idx            ; j++++
658          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[1]
659          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[1]
660         add             $hi1,$nmlo1,$nmlo1
661         addc            %r0,$nmhi1,$nmhi1
662         add             $ablo,$nmlo1,$nmlo1
663         addc            %r0,$nmhi1,$hi1
664          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
665         stw             $nmlo1,-4($tp)          ; tp[j-1]
666
667          fcpy,sgl       %fr0,${fti}L            ; zero high part
668          fcpy,sgl       %fr0,${fab0}L
669         add             $hi1,$hi0,$hi0
670         addc            %r0,%r0,$hi1
671          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
672          fcnvxf,dbl,dbl ${fab0},${fab0}
673         stw             $hi0,0($tp)
674         stw             $hi1,4($tp)
675
676         fadd,dbl        ${fti},${fab0},${fab0}  ; add tp[0]
677         fcnvfx,dbl,dbl  ${fab0},${fab0}         ; double -> 33-bit unsigned int
678         xmpyu           ${fn0},${fab0}R,${fm0}
679         ldo             `$LOCALS+32+4`($fp),$tp
680 L\$outer_pa11
681         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[0]*m
682         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[1]*m
683         fstds           ${fab0},-16($xfer)      ; 33-bit value
684         fstds           ${fnm0},-8($xfer)
685          flddx          $idx($ap),${fai}        ; ap[2,3]
686          flddx          $idx($np),${fni}        ; np[2,3]
687         ldw             -16($xfer),$abhi        ; carry bit actually
688          ldo            8($idx),$idx            ; j++++
689         ldw             -12($xfer),$ablo
690         ldw             -8($xfer),$nmhi0
691         ldw             -4($xfer),$nmlo0
692         ldw             0($xfer),$hi0           ; high part
693
694         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
695         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
696         fstds           ${fab1},0($xfer)
697          addl           $abhi,$hi0,$hi0         ; account carry bit
698         fstds           ${fnm1},8($xfer)
699          add            $ablo,$nmlo0,$nmlo0     ; discarded
700         ldw             0($tp),$ti1             ; tp[1]
701          addc           %r0,$nmhi0,$hi1
702         fstds           ${fab0},-16($xfer)
703         fstds           ${fnm0},-8($xfer)
704         ldw             4($xfer),$ablo
705         ldw             0($xfer),$abhi
706 \f
707 L\$inner_pa11
708         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[i]
709         flddx           $idx($ap),${fai}        ; ap[j,j+1]
710         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
711         flddx           $idx($np),${fni}        ; np[j,j+1]
712          add            $hi0,$ablo,$ablo
713         ldw             4($tp),$ti0             ; tp[j]
714          addc           %r0,$abhi,$abhi
715         ldw             12($xfer),$nmlo1
716          add            $ti1,$ablo,$ablo
717         ldw             8($xfer),$nmhi1
718          addc           %r0,$abhi,$hi0
719         fstds           ${fab1},0($xfer)
720          add            $ablo,$nmlo1,$nmlo1
721         fstds           ${fnm1},8($xfer)
722          addc           %r0,$nmhi1,$nmhi1
723         ldw             -12($xfer),$ablo
724          add            $hi1,$nmlo1,$nmlo1
725         ldw             -16($xfer),$abhi
726          addc           %r0,$nmhi1,$hi1
727
728         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
729         ldw             8($tp),$ti1             ; tp[j]
730         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
731         ldw             -4($xfer),$nmlo0
732          add            $hi0,$ablo,$ablo
733         ldw             -8($xfer),$nmhi0
734          addc           %r0,$abhi,$abhi
735         stw             $nmlo1,-4($tp)          ; tp[j-1]
736          add            $ti0,$ablo,$ablo
737         fstds           ${fab0},-16($xfer)
738          addc           %r0,$abhi,$hi0
739         fstds           ${fnm0},-8($xfer)
740          add            $ablo,$nmlo0,$nmlo0
741         ldw             4($xfer),$ablo
742          addc           %r0,$nmhi0,$nmhi0
743         ldw             0($xfer),$abhi
744          add            $hi1,$nmlo0,$nmlo0
745          stws,ma        $nmlo0,8($tp)           ; tp[j-1]
746         addib,<>        8,$idx,L\$inner_pa11    ; j++++
747          addc           %r0,$nmhi0,$hi1
748
749         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j]*bp[i]
750         ldw             12($xfer),$nmlo1
751         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j]*m
752         ldw             8($xfer),$nmhi1
753          add            $hi0,$ablo,$ablo
754         ldw             4($tp),$ti0             ; tp[j]
755          addc           %r0,$abhi,$abhi
756         fstds           ${fab1},0($xfer)
757          add            $ti1,$ablo,$ablo
758         fstds           ${fnm1},8($xfer)
759          addc           %r0,$abhi,$hi0
760         ldw             -16($xfer),$abhi
761          add            $ablo,$nmlo1,$nmlo1
762         ldw             -12($xfer),$ablo
763          addc           %r0,$nmhi1,$nmhi1
764         ldw             -8($xfer),$nmhi0
765          add            $hi1,$nmlo1,$nmlo1
766         ldw             -4($xfer),$nmlo0
767          addc           %r0,$nmhi1,$hi1
768
769         add             $hi0,$ablo,$ablo
770          stw            $nmlo1,-4($tp)          ; tp[j-1]
771         addc            %r0,$abhi,$abhi
772          add            $ti0,$ablo,$ablo
773         ldw             8($tp),$ti1             ; tp[j]
774          addc           %r0,$abhi,$hi0
775         ldw             0($xfer),$abhi
776          add            $ablo,$nmlo0,$nmlo0
777         ldw             4($xfer),$ablo
778          addc           %r0,$nmhi0,$nmhi0
779         ldws,mb         8($xfer),$nmhi1
780          add            $hi1,$nmlo0,$nmlo0
781         ldw             4($xfer),$nmlo1
782          addc           %r0,$nmhi0,$hi1
783          stws,ma        $nmlo0,8($tp)           ; tp[j-1]
784
785         addib,=         -1,$num,L\$outerdone_pa11; i--
786         subi            0,$arrsz,$idx           ; j=0
787
788          fldws,ma       4($bp),${fbi}           ; bp[i]
789          flddx          $idx($ap),${fai}        ; ap[0]
790         add             $hi0,$ablo,$ablo
791         addc            %r0,$abhi,$abhi
792          flddx          $idx($np),${fni}        ; np[0]
793          fldws          8($xfer),${fti}R        ; tp[0]
794         add             $ti1,$ablo,$ablo
795         addc            %r0,$abhi,$hi0
796
797          ldo            8($idx),$idx            ; j++++
798          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[i]
799          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[i]
800         ldw             4($tp),$ti0             ; tp[j]
801
802         add             $hi1,$nmlo1,$nmlo1
803         addc            %r0,$nmhi1,$nmhi1
804          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
805         add             $ablo,$nmlo1,$nmlo1
806         addc            %r0,$nmhi1,$hi1
807          fcpy,sgl       %fr0,${fti}L            ; zero high part
808          fcpy,sgl       %fr0,${fab0}L
809         stw             $nmlo1,-4($tp)          ; tp[j-1]
810
811          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
812          fcnvxf,dbl,dbl ${fab0},${fab0}
813         add             $hi1,$hi0,$hi0
814         addc            %r0,%r0,$hi1
815          fadd,dbl       ${fti},${fab0},${fab0}  ; add tp[0]
816         add             $ti0,$hi0,$hi0
817         addc            %r0,$hi1,$hi1
818          fcnvfx,dbl,dbl ${fab0},${fab0}         ; double -> 33-bit unsigned int
819         stw             $hi0,0($tp)
820         stw             $hi1,4($tp)
821          xmpyu          ${fn0},${fab0}R,${fm0}
822
823         b               L\$outer_pa11
824         ldo             `$LOCALS+32+4`($fp),$tp
825 \f
826 L\$outerdone_pa11
827         add             $hi0,$ablo,$ablo
828         addc            %r0,$abhi,$abhi
829         add             $ti1,$ablo,$ablo
830         addc            %r0,$abhi,$hi0
831
832         ldw             4($tp),$ti0             ; tp[j]
833
834         add             $hi1,$nmlo1,$nmlo1
835         addc            %r0,$nmhi1,$nmhi1
836         add             $ablo,$nmlo1,$nmlo1
837         addc            %r0,$nmhi1,$hi1
838         stw             $nmlo1,-4($tp)          ; tp[j-1]
839
840         add             $hi1,$hi0,$hi0
841         addc            %r0,%r0,$hi1
842         add             $ti0,$hi0,$hi0
843         addc            %r0,$hi1,$hi1
844         stw             $hi0,0($tp)
845         stw             $hi1,4($tp)
846
847         ldo             `$LOCALS+32+4`($fp),$tp
848         sub             %r0,%r0,%r0             ; clear borrow
849         ldw             -4($tp),$ti0
850         addl            $tp,$arrsz,$tp
851 L\$sub_pa11
852         ldwx            $idx($np),$hi0
853         subb            $ti0,$hi0,$hi1
854         ldwx            $idx($tp),$ti0
855         addib,<>        4,$idx,L\$sub_pa11
856         stws,ma         $hi1,4($rp)
857
858         subb            $ti0,%r0,$hi1
859         ldo             -4($tp),$tp
860         and             $tp,$hi1,$ap
861         andcm           $rp,$hi1,$bp
862         or              $ap,$bp,$np
863
864         sub             $rp,$arrsz,$rp          ; rewind rp
865         subi            0,$arrsz,$idx
866         ldo             `$LOCALS+32`($fp),$tp
867 L\$copy_pa11
868         ldwx            $idx($np),$hi0
869         stws,ma         %r0,4($tp)
870         addib,<>        4,$idx,L\$copy_pa11
871         stws,ma         $hi0,4($rp)     
872
873         nop                                     ; alignment
874 L\$done
875 ___
876 }
877 \f
# Shared function epilogue appended to the $code template: it loads 1 into
# %r28 (annotated below as the "handled" signal), recomputes %sp from $fp,
# reloads %r2 and %r4-%r10 with $POP, and at L$abort branches through %r2
# followed by a $POPMB that reloads %r3.
# NOTE(review): the $POPMB after "bv" presumably executes in the branch
# delay slot -- confirm against the PA-RISC manual.
# This is template text, not final assembly: the backquoted expressions are
# eval'ed, and mnemonics are post-processed, by the output loop at the end
# of this script.
878 $code.=<<___;
879         ldi             1,%r28                  ; signal "handled"
880         ldo             $FRAME($fp),%sp         ; destroy tp[num+1]
881
882         $POP    `-$FRAME-$SAVED_RP`(%sp),%r2    ; standard epilogue
883         $POP    `-$FRAME+1*$SIZE_T`(%sp),%r4
884         $POP    `-$FRAME+2*$SIZE_T`(%sp),%r5
885         $POP    `-$FRAME+3*$SIZE_T`(%sp),%r6
886         $POP    `-$FRAME+4*$SIZE_T`(%sp),%r7
887         $POP    `-$FRAME+5*$SIZE_T`(%sp),%r8
888         $POP    `-$FRAME+6*$SIZE_T`(%sp),%r9
889         $POP    `-$FRAME+7*$SIZE_T`(%sp),%r10
890 L\$abort
891         bv      (%r2)
892         .EXIT
893         $POPMB  -$FRAME(%sp),%r3
894         .PROCEND
895         .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
896 ___
897 \f
898 # Explicitly encode PA-RISC 2.0 instructions used in this module, so
899 # that it can be compiled with .LEVEL 1.0. It should be noted that I
900 # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
901 # directive...
902
# Encoder for the "ldd" instruction.
#
# Arguments:
#   $mod  - text that directly followed the mnemonic (e.g. ",mb"); may be ""
#   $args - operand string
#
# Two operand shapes are hand-assembled into a ".WORD 0x........" directive,
# with the original instruction text kept as a trailing ";" comment:
#   %rX(%rB),%rT   (format 4)
#   imm(%rB),%rT   (format 5; a leading ",m" sets bit 5, ",mb" also bit 13)
# Anything else is returned unchanged, prefixed with a tab, so that the
# assembler gets to deal with it.
my $ldd = sub {
    my ($mod, $args) = @_;
    my $orig = "ldd$mod\t$args";
    my $word;

    if (my ($index, $base, $target) =
            $args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
        # format 4
        $word = (0x03<<26) | ($base<<21) | ($index<<16) | (3<<6) | $target;
    }
    elsif (my ($disp, $breg, $treg) =
            $args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
        # format 5
        $word  = (0x03<<26) | ($breg<<21) | (1<<12) | (3<<6) | $treg;
        # encode offset: low four bits go to 17..20, bit 4 lands on bit 16
        $word |= (($disp & 0xF) << 17) | (($disp & 0x10) << 12);
        $word |= (1<<5)  if ($mod =~ /^,m/);
        $word |= (1<<13) if ($mod =~ /^,mb/);
    }

    return defined($word)
        ? sprintf("\t.WORD\t0x%08x\t; %s", $word, $orig)
        : "\t" . $orig;
};
920
# Encoder for the "std" instruction.
#
# Arguments:
#   $mod  - text that directly followed the mnemonic (e.g. ",ma"); may be ""
#   $args - operand string
#
# Operands of the shape "%rS,imm(%rB)" (format 6) are hand-assembled into a
# ".WORD 0x........" directive carrying the original text as a ";" comment.
# A leading ",m" in $mod sets bit 5 and ",mb" additionally sets bit 13.
# Any other operand string is handed back untouched behind a tab.
my $std = sub {
    my ($mod, $args) = @_;
    my $orig = "std$mod\t$args";

    my ($src, $disp, $base) =
        $args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/
        or return "\t" . $orig;         # not format 6, leave it alone

    my $word = (0x03<<26) | ($base<<21) | ($src<<16) | (1<<12) | (0xB<<6);
    # encode offset: low four bits go to 1..4, bit 4 lands on bit 0
    $word |= (($disp & 0xF) << 1) | (($disp & 0x10) >> 4);
    $word |= (1<<5)  if ($mod =~ /^,m/);
    $word |= (1<<13) if ($mod =~ /^,mb/);

    return sprintf("\t.WORD\t0x%08x\t; %s", $word, $orig);
};
934
# Encoder for the "extrd" instruction.
#
# Arguments:
#   $mod  - text that directly followed the mnemonic; may be ""
#   $args - operand string
#
# Two operand shapes are hand-assembled into a ".WORD 0x........" directive
# carrying the original text as a ";" comment:
#   %rS,pos,len,%rT    (format 15)
#   %rS,%sar,len,%rT   (format 12; a ",=" or ",*=" in $mod sets bit 13)
# Any other operand string is handed back untouched behind a tab.
my $extrd = sub {
    my ($mod, $args) = @_;
    my $orig = "extrd$mod\t$args";
    my $word;

    # I only have ",u" completer, it's implicitly encoded...
    if (my ($src, $pos, $width, $dst) =
            $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {
        # format 15
        my $len = 32 - $width;
        $word  = (0x36<<26) | ($src<<21) | ($dst<<16);
        $word |= (($pos & 0x20) << 6) | (($pos & 0x1f) << 5);   # encode pos
        $word |= (($len & 0x20) << 7) | ($len & 0x1f);          # encode len
    }
    elsif (my ($reg, $span, $tgt) =
            $args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {
        # format 12
        my $len = 32 - $span;
        $word  = (0x34<<26) | ($reg<<21) | ($tgt<<16) | (2<<11) | (1<<9);
        $word |= (($len & 0x20) << 3) | ($len & 0x1f);          # encode len
        $word |= (1<<13) if ($mod =~ /,\**=/);
    }

    return defined($word)
        ? sprintf("\t.WORD\t0x%08x\t; %s", $word, $orig)
        : "\t" . $orig;
};
956
# Encoder for the "shrpd" instruction.
#
# Arguments:
#   $mod  - text that directly followed the mnemonic; may be ""
#   $args - operand string
#
# Operands of the shape "%rA,%rB,sa,%rT" (format 14) are hand-assembled into
# a ".WORD 0x........" directive carrying the original text as a ";"
# comment; the shift amount is stored as 63-sa.  Any other operand string is
# handed back untouched behind a tab.
my $shrpd = sub {
    my ($mod, $args) = @_;
    my $orig = "shrpd$mod\t$args";

    my ($r1, $r2, $sa, $target) =
        $args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/
        or return "\t" . $orig;         # not format 14, leave it alone

    my $cpos = 63 - $sa;
    my $word = (0x34<<26) | ($r2<<21) | ($r1<<16) | (1<<10) | $target;
    $word |= (($cpos & 0x20) << 6) | (($cpos & 0x1f) << 5);     # encode sa

    return sprintf("\t.WORD\t0x%08x\t; %s", $word, $orig);
};
969
# Encoder for the "sub" instruction.
#
# Arguments:
#   $mod  - text that directly followed the mnemonic; may be ""
#   $args - operand string
#
# Only "sub,db %rA,%rB,%rT" is hand-assembled (into a ".WORD 0x........"
# directive carrying the original text as a ";" comment).  Every other
# completer or operand shape is handed back untouched behind a tab.
my $sub = sub {
    my ($mod, $args) = @_;
    my $orig = "sub$mod\t$args";

    return "\t" . $orig unless $mod eq ",db";

    my ($r1, $r2, $target) = $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/
        or return "\t" . $orig;

    my $word = (0x02<<26) | ($r2<<21) | ($r1<<16) | $target
             | (1<<10)          # e1
             | (1<<8)           # e2
             | (1<<5);          # d

    return sprintf("\t.WORD\t0x%08x\t; %s", $word, $orig);
};
983
# Dispatch one instruction to its hand-written encoder, if there is one.
#
# Arguments:
#   $mnemonic - instruction name
#   $mod      - text that directly followed the mnemonic; may be ""
#   $args     - operand string
#
# The scalar variable named after the mnemonic is looked up; when it holds a
# code reference that encoder is called with ($mod,$args) and its result is
# returned.  Otherwise the instruction is re-emitted as
# "\t$mnemonic$mod\t$args".
sub assemble {
    my ($mnemonic, $mod, $args) = @_;
    my $encoder = eval("\$$mnemonic");

    return $encoder->($mod, $args) if ref($encoder) eq 'CODE';
    return "\t$mnemonic$mod\t$args";
}
990
# Post-process the accumulated $code template one line at a time and print
# the result to STDOUT.
foreach (split("\n",$code)) {
	# evaluate the Perl expressions quoted in backticks
	s/\`([^\`]*)\`/eval $1/ge;
	# flip word order in 64-bit mode...
	s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
	# assemble 2.0 instructions in 32-bit mode...
	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e if ($BN_SZ==4);

	# "bv" is rewritten to "bve" when $SIZE_T is 8
	s/\bbv\b/bve/gm if ($SIZE_T==8);

	print $_,"\n";
}
# Output is buffered, so a failed write (full disk, closed pipe, ...) may
# only be reported here.  Fail loudly instead of leaving a truncated
# assembly file behind with a zero exit status.
close STDOUT or die "error closing STDOUT: $!";