78ab2dc30def2bf9a4d8d474d27ad95efd503b9d
[openssl.git] / crypto / bn / asm / parisc-mont.pl
1 #! /usr/bin/env perl
2 # Copyright 2009-2018 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # On PA-7100LC this module performs ~90-50% better, less for longer
18 # keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
19 # that compiler utilized xmpyu instruction to perform 32x32=64-bit
20 # multiplication, which in turn means that "baseline" performance was
21 # optimal in respect to instruction set capabilities. Fair comparison
22 # with vendor compiler is problematic, because OpenSSL doesn't define
23 # BN_LLONG [presumably] for historical reasons, which drives compiler
24 # toward 4 times 16x16=32-bit multiplications [plus complementary
25 # shifts and additions] instead. This means that you should observe
26 # several times improvement over code generated by vendor compiler
27 # for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
28 # improvement coefficient was never collected on PA-7100LC, or any
29 # other 1.1 CPU, because I don't have access to such machine with
30 # vendor compiler. But to give you a taste, PA-RISC 1.1 code path
31 # reportedly outperformed code generated by cc +DA1.1 +O3 by factor
32 # of ~5x on PA-8600.
33 #
34 # On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
35 # reportedly ~2x faster than vendor compiler generated code [according
36 # to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
37 # this implementation is actually 32-bit one, in the sense that it
38 # operates on 32-bit values. But pa-risc2[W].s operates on arrays of
39 # 64-bit BN_LONGs... How do they interoperate then? No problem. This
40 # module picks halves of 64-bit values in reverse order and pretends
41 # they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
42 # 64-bit code such as pa-risc2[W].s then? Well, the thing is that
43 # 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
44 # i.e. there is no "wider" multiplication like on most other 64-bit
45 # platforms. This means that even being effectively 32-bit, this
46 # implementation performs "64-bit" computational task in same amount
47 # of arithmetic operations, most notably multiplications. It requires
48 # more memory references, most notably to tp[num], but this doesn't
49 # seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
50 # 2.0 code path provides virtually same performance as pa-risc2[W].s:
51 # it's ~10% better for shortest key length and ~10% worse for longest
52 # one.
53 #
54 # In case it wasn't clear. The module has two distinct code paths:
55 # PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
56 # additions and 64-bit integer loads, not to mention specific
57 # instruction scheduling. In 64-bit build naturally only 2.0 code path
58 # is assembled. In 32-bit application context both code paths are
59 # assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
60 # is taken automatically. Also, in 32-bit build the module imposes
61 # couple of limitations: vector lengths has to be even and vector
62 # addresses has to be 64-bit aligned. Normally neither is a problem:
63 # most common key lengths are even and vectors are commonly malloc-ed,
64 # which ensures alignment.
65 #
66 # Special thanks to polarhome.com for providing HP-UX account on
67 # PA-RISC 1.1 machine, and to correspondent who chose to remain
68 # anonymous for testing the code on PA-RISC 2.0 machine.
69 \f
70 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
71
72 $flavour = shift;
73 $output = shift;
74
75 open STDOUT,">$output";
76
77 if ($flavour =~ /64/) {
78         $LEVEL          ="2.0W";
79         $SIZE_T         =8;
80         $FRAME_MARKER   =80;
81         $SAVED_RP       =16;
82         $PUSH           ="std";
83         $PUSHMA         ="std,ma";
84         $POP            ="ldd";
85         $POPMB          ="ldd,mb";
86         $BN_SZ          =$SIZE_T;
87 } else {
88         $LEVEL          ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
89         $SIZE_T         =4;
90         $FRAME_MARKER   =48;
91         $SAVED_RP       =20;
92         $PUSH           ="stw";
93         $PUSHMA         ="stwm";
94         $POP            ="ldw";
95         $POPMB          ="ldwm";
96         $BN_SZ          =$SIZE_T;
97         if (open CONF,"<${dir}../../opensslconf.h") {
98             while(<CONF>) {
99                 if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
100                     $BN_SZ=8;
101                     $LEVEL="2.0";
102                     last;
103                 }
104             }
105             close CONF;
106         }
107 }
108
109 $FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker
110                                 #                [+ argument transfer]
111 $LOCALS=$FRAME-$FRAME_MARKER;
112 $FRAME+=32;                     # local variables
113
114 $tp="%r31";
115 $ti1="%r29";
116 $ti0="%r28";
117
118 $rp="%r26";
119 $ap="%r25";
120 $bp="%r24";
121 $np="%r23";
122 $n0="%r22";     # passed through stack in 32-bit
123 $num="%r21";    # passed through stack in 32-bit
124 $idx="%r20";
125 $arrsz="%r19";
126
127 $nm1="%r7";
128 $nm0="%r6";
129 $ab1="%r5";
130 $ab0="%r4";
131
132 $fp="%r3";
133 $hi1="%r2";
134 $hi0="%r1";
135
136 $xfer=$n0;      # accommodates [-16..15] offset in fld[dw]s
137
138 $fm0="%fr4";    $fti=$fm0;
139 $fbi="%fr5L";
140 $fn0="%fr5R";
141 $fai="%fr6";    $fab0="%fr7";   $fab1="%fr8";
142 $fni="%fr9";    $fnm0="%fr10";  $fnm1="%fr11";
143
144 $code=<<___;
145         .LEVEL  $LEVEL
146         .SPACE  \$TEXT\$
147         .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
148
149         .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
150         .ALIGN  64
151 bn_mul_mont
152         .PROC
153         .CALLINFO       FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
154         .ENTRY
155         $PUSH   %r2,-$SAVED_RP(%sp)             ; standard prologue
156         $PUSHMA %r3,$FRAME(%sp)
157         $PUSH   %r4,`-$FRAME+1*$SIZE_T`(%sp)
158         $PUSH   %r5,`-$FRAME+2*$SIZE_T`(%sp)
159         $PUSH   %r6,`-$FRAME+3*$SIZE_T`(%sp)
160         $PUSH   %r7,`-$FRAME+4*$SIZE_T`(%sp)
161         $PUSH   %r8,`-$FRAME+5*$SIZE_T`(%sp)
162         $PUSH   %r9,`-$FRAME+6*$SIZE_T`(%sp)
163         $PUSH   %r10,`-$FRAME+7*$SIZE_T`(%sp)
164         ldo     -$FRAME(%sp),$fp
165 ___
166 $code.=<<___ if ($SIZE_T==4);
167         ldw     `-$FRAME_MARKER-4`($fp),$n0
168         ldw     `-$FRAME_MARKER-8`($fp),$num
169         nop
170         nop                                     ; alignment
171 ___
172 $code.=<<___ if ($BN_SZ==4);
173         comiclr,<=      6,$num,%r0              ; are vectors long enough?
174         b               L\$abort
175         ldi             0,%r28                  ; signal "unhandled"
176         add,ev          %r0,$num,$num           ; is $num even?
177         b               L\$abort
178         nop
179         or              $ap,$np,$ti1
180         extru,=         $ti1,31,3,%r0           ; are ap and np 64-bit aligned?
181         b               L\$abort
182         nop
183         nop                                     ; alignment
184         nop
185
186         fldws           0($n0),${fn0}
187         fldws,ma        4($bp),${fbi}           ; bp[0]
188 ___
189 $code.=<<___ if ($BN_SZ==8);
190         comib,>         3,$num,L\$abort         ; are vectors long enough?
191         ldi             0,%r28                  ; signal "unhandled"
192         addl            $num,$num,$num          ; I operate on 32-bit values
193
194         fldws           4($n0),${fn0}           ; only low part of n0
195         fldws           4($bp),${fbi}           ; bp[0] in flipped word order
196 ___
197 $code.=<<___;
198         fldds           0($ap),${fai}           ; ap[0,1]
199         fldds           0($np),${fni}           ; np[0,1]
200
201         sh2addl         $num,%r0,$arrsz
202         ldi             31,$hi0
203         ldo             36($arrsz),$hi1         ; space for tp[num+1]
204         andcm           $hi1,$hi0,$hi1          ; align
205         addl            $hi1,%sp,%sp
206         $PUSH           $fp,-$SIZE_T(%sp)
207
208         ldo             `$LOCALS+16`($fp),$xfer
209         ldo             `$LOCALS+32+4`($fp),$tp
210
211         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[0]*bp[0]
212         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[1]*bp[0]
213         xmpyu           ${fn0},${fab0}R,${fm0}
214
215         addl            $arrsz,$ap,$ap          ; point at the end
216         addl            $arrsz,$np,$np
217         subi            0,$arrsz,$idx           ; j=0
218         ldo             8($idx),$idx            ; j++++
219
220         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[0]*m
221         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[1]*m
222         fstds           ${fab0},-16($xfer)
223         fstds           ${fnm0},-8($xfer)
224         fstds           ${fab1},0($xfer)
225         fstds           ${fnm1},8($xfer)
226          flddx          $idx($ap),${fai}        ; ap[2,3]
227          flddx          $idx($np),${fni}        ; np[2,3]
228 ___
229 $code.=<<___ if ($BN_SZ==4);
230         mtctl           $hi0,%cr11              ; $hi0 still holds 31
231         extrd,u,*=      $hi0,%sar,1,$hi0        ; executes on PA-RISC 1.0
232         b               L\$parisc11
233         nop
234 ___
235 $code.=<<___;                                   # PA-RISC 2.0 code-path
236         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
237         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
238         ldd             -16($xfer),$ab0
239         fstds           ${fab0},-16($xfer)
240
241         extrd,u         $ab0,31,32,$hi0
242         extrd,u         $ab0,63,32,$ab0
243         ldd             -8($xfer),$nm0
244         fstds           ${fnm0},-8($xfer)
245          ldo            8($idx),$idx            ; j++++
246          addl           $ab0,$nm0,$nm0          ; low part is discarded
247          extrd,u        $nm0,31,32,$hi1
248 \f
249 L\$1st
250         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[0]
251         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
252         ldd             0($xfer),$ab1
253         fstds           ${fab1},0($xfer)
254          addl           $hi0,$ab1,$ab1
255          extrd,u        $ab1,31,32,$hi0
256         ldd             8($xfer),$nm1
257         fstds           ${fnm1},8($xfer)
258          extrd,u        $ab1,63,32,$ab1
259          addl           $hi1,$nm1,$nm1
260         flddx           $idx($ap),${fai}        ; ap[j,j+1]
261         flddx           $idx($np),${fni}        ; np[j,j+1]
262          addl           $ab1,$nm1,$nm1
263          extrd,u        $nm1,31,32,$hi1
264
265         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
266         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
267         ldd             -16($xfer),$ab0
268         fstds           ${fab0},-16($xfer)
269          addl           $hi0,$ab0,$ab0
270          extrd,u        $ab0,31,32,$hi0
271         ldd             -8($xfer),$nm0
272         fstds           ${fnm0},-8($xfer)
273          extrd,u        $ab0,63,32,$ab0
274          addl           $hi1,$nm0,$nm0
275         stw             $nm1,-4($tp)            ; tp[j-1]
276          addl           $ab0,$nm0,$nm0
277          stw,ma         $nm0,8($tp)             ; tp[j-1]
278         addib,<>        8,$idx,L\$1st           ; j++++
279          extrd,u        $nm0,31,32,$hi1
280
281         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j]*bp[0]
282         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j]*m
283         ldd             0($xfer),$ab1
284         fstds           ${fab1},0($xfer)
285          addl           $hi0,$ab1,$ab1
286          extrd,u        $ab1,31,32,$hi0
287         ldd             8($xfer),$nm1
288         fstds           ${fnm1},8($xfer)
289          extrd,u        $ab1,63,32,$ab1
290          addl           $hi1,$nm1,$nm1
291         ldd             -16($xfer),$ab0
292          addl           $ab1,$nm1,$nm1
293         ldd             -8($xfer),$nm0
294          extrd,u        $nm1,31,32,$hi1
295
296          addl           $hi0,$ab0,$ab0
297          extrd,u        $ab0,31,32,$hi0
298         stw             $nm1,-4($tp)            ; tp[j-1]
299          extrd,u        $ab0,63,32,$ab0
300          addl           $hi1,$nm0,$nm0
301         ldd             0($xfer),$ab1
302          addl           $ab0,$nm0,$nm0
303         ldd,mb          8($xfer),$nm1
304          extrd,u        $nm0,31,32,$hi1
305         stw,ma          $nm0,8($tp)             ; tp[j-1]
306
307         ldo             -1($num),$num           ; i--
308         subi            0,$arrsz,$idx           ; j=0
309 ___
310 $code.=<<___ if ($BN_SZ==4);
311         fldws,ma        4($bp),${fbi}           ; bp[1]
312 ___
313 $code.=<<___ if ($BN_SZ==8);
314         fldws           0($bp),${fbi}           ; bp[1] in flipped word order
315 ___
316 $code.=<<___;
317          flddx          $idx($ap),${fai}        ; ap[0,1]
318          flddx          $idx($np),${fni}        ; np[0,1]
319          fldws          8($xfer),${fti}R        ; tp[0]
320         addl            $hi0,$ab1,$ab1
321          extrd,u        $ab1,31,32,$hi0
322          extrd,u        $ab1,63,32,$ab1
323          ldo            8($idx),$idx            ; j++++
324          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[1]
325          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[1]
326         addl            $hi1,$nm1,$nm1
327         addl            $ab1,$nm1,$nm1
328         extrd,u         $nm1,31,32,$hi1
329          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
330         stw             $nm1,-4($tp)            ; tp[j-1]
331
332          fcpy,sgl       %fr0,${fti}L            ; zero high part
333          fcpy,sgl       %fr0,${fab0}L
334         addl            $hi1,$hi0,$hi0
335         extrd,u         $hi0,31,32,$hi1
336          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
337          fcnvxf,dbl,dbl ${fab0},${fab0}
338         stw             $hi0,0($tp)
339         stw             $hi1,4($tp)
340
341         fadd,dbl        ${fti},${fab0},${fab0}  ; add tp[0]
342         fcnvfx,dbl,dbl  ${fab0},${fab0}         ; double -> 33-bit unsigned int
343         xmpyu           ${fn0},${fab0}R,${fm0}
344         ldo             `$LOCALS+32+4`($fp),$tp
345 L\$outer
346         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[0]*m
347         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[1]*m
348         fstds           ${fab0},-16($xfer)      ; 33-bit value
349         fstds           ${fnm0},-8($xfer)
350          flddx          $idx($ap),${fai}        ; ap[2]
351          flddx          $idx($np),${fni}        ; np[2]
352          ldo            8($idx),$idx            ; j++++
353         ldd             -16($xfer),$ab0         ; 33-bit value
354         ldd             -8($xfer),$nm0
355         ldw             0($xfer),$hi0           ; high part
356
357         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
358         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
359          extrd,u        $ab0,31,32,$ti0         ; carry bit
360          extrd,u        $ab0,63,32,$ab0
361         fstds           ${fab1},0($xfer)
362          addl           $ti0,$hi0,$hi0          ; account carry bit
363         fstds           ${fnm1},8($xfer)
364          addl           $ab0,$nm0,$nm0          ; low part is discarded
365         ldw             0($tp),$ti1             ; tp[1]
366          extrd,u        $nm0,31,32,$hi1
367         fstds           ${fab0},-16($xfer)
368         fstds           ${fnm0},-8($xfer)
369 \f
370 L\$inner
371         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[i]
372         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
373         ldd             0($xfer),$ab1
374         fstds           ${fab1},0($xfer)
375          addl           $hi0,$ti1,$ti1
376          addl           $ti1,$ab1,$ab1
377         ldd             8($xfer),$nm1
378         fstds           ${fnm1},8($xfer)
379          extrd,u        $ab1,31,32,$hi0
380          extrd,u        $ab1,63,32,$ab1
381         flddx           $idx($ap),${fai}        ; ap[j,j+1]
382         flddx           $idx($np),${fni}        ; np[j,j+1]
383          addl           $hi1,$nm1,$nm1
384          addl           $ab1,$nm1,$nm1
385         ldw             4($tp),$ti0             ; tp[j]
386         stw             $nm1,-4($tp)            ; tp[j-1]
387
388         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
389         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
390         ldd             -16($xfer),$ab0
391         fstds           ${fab0},-16($xfer)
392          addl           $hi0,$ti0,$ti0
393          addl           $ti0,$ab0,$ab0
394         ldd             -8($xfer),$nm0
395         fstds           ${fnm0},-8($xfer)
396          extrd,u        $ab0,31,32,$hi0
397          extrd,u        $nm1,31,32,$hi1
398         ldw             8($tp),$ti1             ; tp[j]
399          extrd,u        $ab0,63,32,$ab0
400          addl           $hi1,$nm0,$nm0
401          addl           $ab0,$nm0,$nm0
402          stw,ma         $nm0,8($tp)             ; tp[j-1]
403         addib,<>        8,$idx,L\$inner         ; j++++
404          extrd,u        $nm0,31,32,$hi1
405
406         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j]*bp[i]
407         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j]*m
408         ldd             0($xfer),$ab1
409         fstds           ${fab1},0($xfer)
410          addl           $hi0,$ti1,$ti1
411          addl           $ti1,$ab1,$ab1
412         ldd             8($xfer),$nm1
413         fstds           ${fnm1},8($xfer)
414          extrd,u        $ab1,31,32,$hi0
415          extrd,u        $ab1,63,32,$ab1
416         ldw             4($tp),$ti0             ; tp[j]
417          addl           $hi1,$nm1,$nm1
418          addl           $ab1,$nm1,$nm1
419         ldd             -16($xfer),$ab0
420         ldd             -8($xfer),$nm0
421          extrd,u        $nm1,31,32,$hi1
422
423         addl            $hi0,$ab0,$ab0
424          addl           $ti0,$ab0,$ab0
425          stw            $nm1,-4($tp)            ; tp[j-1]
426          extrd,u        $ab0,31,32,$hi0
427         ldw             8($tp),$ti1             ; tp[j]
428          extrd,u        $ab0,63,32,$ab0
429          addl           $hi1,$nm0,$nm0
430         ldd             0($xfer),$ab1
431          addl           $ab0,$nm0,$nm0
432         ldd,mb          8($xfer),$nm1
433          extrd,u        $nm0,31,32,$hi1
434          stw,ma         $nm0,8($tp)             ; tp[j-1]
435
436         addib,=         -1,$num,L\$outerdone    ; i--
437         subi            0,$arrsz,$idx           ; j=0
438 ___
439 $code.=<<___ if ($BN_SZ==4);
440         fldws,ma        4($bp),${fbi}           ; bp[i]
441 ___
442 $code.=<<___ if ($BN_SZ==8);
443         ldi             12,$ti0                 ; bp[i] in flipped word order
444         addl,ev         %r0,$num,$num
445         ldi             -4,$ti0
446         addl            $ti0,$bp,$bp
447         fldws           0($bp),${fbi}
448 ___
449 $code.=<<___;
450          flddx          $idx($ap),${fai}        ; ap[0]
451         addl            $hi0,$ab1,$ab1
452          flddx          $idx($np),${fni}        ; np[0]
453          fldws          8($xfer),${fti}R        ; tp[0]
454         addl            $ti1,$ab1,$ab1
455         extrd,u         $ab1,31,32,$hi0
456         extrd,u         $ab1,63,32,$ab1
457
458          ldo            8($idx),$idx            ; j++++
459          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[i]
460          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[i]
461         ldw             4($tp),$ti0             ; tp[j]
462
463         addl            $hi1,$nm1,$nm1
464          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
465         addl            $ab1,$nm1,$nm1
466         extrd,u         $nm1,31,32,$hi1
467          fcpy,sgl       %fr0,${fti}L            ; zero high part
468          fcpy,sgl       %fr0,${fab0}L
469         stw             $nm1,-4($tp)            ; tp[j-1]
470
471          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
472          fcnvxf,dbl,dbl ${fab0},${fab0}
473         addl            $hi1,$hi0,$hi0
474          fadd,dbl       ${fti},${fab0},${fab0}  ; add tp[0]
475         addl            $ti0,$hi0,$hi0
476         extrd,u         $hi0,31,32,$hi1
477          fcnvfx,dbl,dbl ${fab0},${fab0}         ; double -> 33-bit unsigned int
478         stw             $hi0,0($tp)
479         stw             $hi1,4($tp)
480          xmpyu          ${fn0},${fab0}R,${fm0}
481
482         b               L\$outer
483         ldo             `$LOCALS+32+4`($fp),$tp
484 \f
485 L\$outerdone
486         addl            $hi0,$ab1,$ab1
487         addl            $ti1,$ab1,$ab1
488         extrd,u         $ab1,31,32,$hi0
489         extrd,u         $ab1,63,32,$ab1
490
491         ldw             4($tp),$ti0             ; tp[j]
492
493         addl            $hi1,$nm1,$nm1
494         addl            $ab1,$nm1,$nm1
495         extrd,u         $nm1,31,32,$hi1
496         stw             $nm1,-4($tp)            ; tp[j-1]
497
498         addl            $hi1,$hi0,$hi0
499         addl            $ti0,$hi0,$hi0
500         extrd,u         $hi0,31,32,$hi1
501         stw             $hi0,0($tp)
502         stw             $hi1,4($tp)
503
504         ldo             `$LOCALS+32`($fp),$tp
505         sub             %r0,%r0,%r0             ; clear borrow
506 ___
507 $code.=<<___ if ($BN_SZ==4);
508         ldws,ma         4($tp),$ti0
509         extru,=         $rp,31,3,%r0            ; is rp 64-bit aligned?
510         b               L\$sub_pa11
511         addl            $tp,$arrsz,$tp
512 L\$sub
513         ldwx            $idx($np),$hi0
514         subb            $ti0,$hi0,$hi1
515         ldwx            $idx($tp),$ti0
516         addib,<>        4,$idx,L\$sub
517         stws,ma         $hi1,4($rp)
518
519         subb            $ti0,%r0,$hi1
520 ___
521 $code.=<<___ if ($BN_SZ==8);
522         ldd,ma          8($tp),$ti0
523 L\$sub
524         ldd             $idx($np),$hi0
525         shrpd           $ti0,$ti0,32,$ti0       ; flip word order
526         std             $ti0,-8($tp)            ; save flipped value
527         sub,db          $ti0,$hi0,$hi1
528         ldd,ma          8($tp),$ti0
529         addib,<>        8,$idx,L\$sub
530         std,ma          $hi1,8($rp)
531
532         extrd,u         $ti0,31,32,$ti0         ; carry in flipped word order
533         sub,db          $ti0,%r0,$hi1
534 ___
535 $code.=<<___;
536         ldo             `$LOCALS+32`($fp),$tp
537         sub             $rp,$arrsz,$rp          ; rewind rp
538         subi            0,$arrsz,$idx
539 L\$copy
540         ldd             0($tp),$ti0
541         ldd             0($rp),$hi0
542         std,ma          %r0,8($tp)
543         comiclr,=       0,$hi1,%r0
544         copy            $ti0,$hi0
545         addib,<>        8,$idx,L\$copy
546         std,ma          $hi0,8($rp)
547 ___
548
549 if ($BN_SZ==4) {                                # PA-RISC 1.1 code-path
550 $ablo=$ab0;
551 $abhi=$ab1;
552 $nmlo0=$nm0;
553 $nmhi0=$nm1;
554 $nmlo1="%r9";
555 $nmhi1="%r8";
556
557 $code.=<<___;
558         b               L\$done
559         nop
560
561         .ALIGN          8
562 L\$parisc11
563         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
564         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
565         ldw             -12($xfer),$ablo
566         ldw             -16($xfer),$hi0
567         ldw             -4($xfer),$nmlo0
568         ldw             -8($xfer),$nmhi0
569         fstds           ${fab0},-16($xfer)
570         fstds           ${fnm0},-8($xfer)
571
572          ldo            8($idx),$idx            ; j++++
573          add            $ablo,$nmlo0,$nmlo0     ; discarded
574          addc           %r0,$nmhi0,$hi1
575         ldw             4($xfer),$ablo
576         ldw             0($xfer),$abhi
577         nop
578 \f
579 L\$1st_pa11
580         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[0]
581         flddx           $idx($ap),${fai}        ; ap[j,j+1]
582         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
583         flddx           $idx($np),${fni}        ; np[j,j+1]
584          add            $hi0,$ablo,$ablo
585         ldw             12($xfer),$nmlo1
586          addc           %r0,$abhi,$hi0
587         ldw             8($xfer),$nmhi1
588          add            $ablo,$nmlo1,$nmlo1
589         fstds           ${fab1},0($xfer)
590          addc           %r0,$nmhi1,$nmhi1
591         fstds           ${fnm1},8($xfer)
592          add            $hi1,$nmlo1,$nmlo1
593         ldw             -12($xfer),$ablo
594          addc           %r0,$nmhi1,$hi1
595         ldw             -16($xfer),$abhi
596
597         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
598         ldw             -4($xfer),$nmlo0
599         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
600         ldw             -8($xfer),$nmhi0
601          add            $hi0,$ablo,$ablo
602         stw             $nmlo1,-4($tp)          ; tp[j-1]
603          addc           %r0,$abhi,$hi0
604         fstds           ${fab0},-16($xfer)
605          add            $ablo,$nmlo0,$nmlo0
606         fstds           ${fnm0},-8($xfer)
607          addc           %r0,$nmhi0,$nmhi0
608         ldw             0($xfer),$abhi
609          add            $hi1,$nmlo0,$nmlo0
610         ldw             4($xfer),$ablo
611          stws,ma        $nmlo0,8($tp)           ; tp[j-1]
612         addib,<>        8,$idx,L\$1st_pa11      ; j++++
613          addc           %r0,$nmhi0,$hi1
614
615          ldw            8($xfer),$nmhi1
616          ldw            12($xfer),$nmlo1
617         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j]*bp[0]
618         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j]*m
619          add            $hi0,$ablo,$ablo
620         fstds           ${fab1},0($xfer)
621          addc           %r0,$abhi,$hi0
622         fstds           ${fnm1},8($xfer)
623          add            $ablo,$nmlo1,$nmlo1
624         ldw             -16($xfer),$abhi
625          addc           %r0,$nmhi1,$nmhi1
626         ldw             -12($xfer),$ablo
627          add            $hi1,$nmlo1,$nmlo1
628         ldw             -8($xfer),$nmhi0
629          addc           %r0,$nmhi1,$hi1
630         ldw             -4($xfer),$nmlo0
631
632          add            $hi0,$ablo,$ablo
633         stw             $nmlo1,-4($tp)          ; tp[j-1]
634          addc           %r0,$abhi,$hi0
635         ldw             0($xfer),$abhi
636          add            $ablo,$nmlo0,$nmlo0
637         ldw             4($xfer),$ablo
638          addc           %r0,$nmhi0,$nmhi0
639         ldws,mb         8($xfer),$nmhi1
640          add            $hi1,$nmlo0,$nmlo0
641         ldw             4($xfer),$nmlo1
642          addc           %r0,$nmhi0,$hi1
643         stws,ma         $nmlo0,8($tp)           ; tp[j-1]
644
645         ldo             -1($num),$num           ; i--
646         subi            0,$arrsz,$idx           ; j=0
647
648          fldws,ma       4($bp),${fbi}           ; bp[1]
649          flddx          $idx($ap),${fai}        ; ap[0,1]
650          flddx          $idx($np),${fni}        ; np[0,1]
651          fldws          8($xfer),${fti}R        ; tp[0]
652         add             $hi0,$ablo,$ablo
653         addc            %r0,$abhi,$hi0
654          ldo            8($idx),$idx            ; j++++
655          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[1]
656          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[1]
657         add             $hi1,$nmlo1,$nmlo1
658         addc            %r0,$nmhi1,$nmhi1
659         add             $ablo,$nmlo1,$nmlo1
660         addc            %r0,$nmhi1,$hi1
661          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
662         stw             $nmlo1,-4($tp)          ; tp[j-1]
663
664          fcpy,sgl       %fr0,${fti}L            ; zero high part
665          fcpy,sgl       %fr0,${fab0}L
666         add             $hi1,$hi0,$hi0
667         addc            %r0,%r0,$hi1
668          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
669          fcnvxf,dbl,dbl ${fab0},${fab0}
670         stw             $hi0,0($tp)
671         stw             $hi1,4($tp)
672
673         fadd,dbl        ${fti},${fab0},${fab0}  ; add tp[0]
674         fcnvfx,dbl,dbl  ${fab0},${fab0}         ; double -> 33-bit unsigned int
675         xmpyu           ${fn0},${fab0}R,${fm0}
676         ldo             `$LOCALS+32+4`($fp),$tp
677 L\$outer_pa11
678         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[0]*m
679         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[1]*m
680         fstds           ${fab0},-16($xfer)      ; 33-bit value
681         fstds           ${fnm0},-8($xfer)
682          flddx          $idx($ap),${fai}        ; ap[2,3]
683          flddx          $idx($np),${fni}        ; np[2,3]
684         ldw             -16($xfer),$abhi        ; carry bit actually
685          ldo            8($idx),$idx            ; j++++
686         ldw             -12($xfer),$ablo
687         ldw             -8($xfer),$nmhi0
688         ldw             -4($xfer),$nmlo0
689         ldw             0($xfer),$hi0           ; high part
690
691         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
692         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
693         fstds           ${fab1},0($xfer)
694          addl           $abhi,$hi0,$hi0         ; account carry bit
695         fstds           ${fnm1},8($xfer)
696          add            $ablo,$nmlo0,$nmlo0     ; discarded
697         ldw             0($tp),$ti1             ; tp[1]
698          addc           %r0,$nmhi0,$hi1
699         fstds           ${fab0},-16($xfer)
700         fstds           ${fnm0},-8($xfer)
701         ldw             4($xfer),$ablo
702         ldw             0($xfer),$abhi
703 \f
704 L\$inner_pa11
705         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[i]
706         flddx           $idx($ap),${fai}        ; ap[j,j+1]
707         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
708         flddx           $idx($np),${fni}        ; np[j,j+1]
709          add            $hi0,$ablo,$ablo
710         ldw             4($tp),$ti0             ; tp[j]
711          addc           %r0,$abhi,$abhi
712         ldw             12($xfer),$nmlo1
713          add            $ti1,$ablo,$ablo
714         ldw             8($xfer),$nmhi1
715          addc           %r0,$abhi,$hi0
716         fstds           ${fab1},0($xfer)
717          add            $ablo,$nmlo1,$nmlo1
718         fstds           ${fnm1},8($xfer)
719          addc           %r0,$nmhi1,$nmhi1
720         ldw             -12($xfer),$ablo
721          add            $hi1,$nmlo1,$nmlo1
722         ldw             -16($xfer),$abhi
723          addc           %r0,$nmhi1,$hi1
724
725         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
726         ldw             8($tp),$ti1             ; tp[j]
727         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
728         ldw             -4($xfer),$nmlo0
729          add            $hi0,$ablo,$ablo
730         ldw             -8($xfer),$nmhi0
731          addc           %r0,$abhi,$abhi
732         stw             $nmlo1,-4($tp)          ; tp[j-1]
733          add            $ti0,$ablo,$ablo
734         fstds           ${fab0},-16($xfer)
735          addc           %r0,$abhi,$hi0
736         fstds           ${fnm0},-8($xfer)
737          add            $ablo,$nmlo0,$nmlo0
738         ldw             4($xfer),$ablo
739          addc           %r0,$nmhi0,$nmhi0
740         ldw             0($xfer),$abhi
741          add            $hi1,$nmlo0,$nmlo0
742          stws,ma        $nmlo0,8($tp)           ; tp[j-1]
743         addib,<>        8,$idx,L\$inner_pa11    ; j++++
744          addc           %r0,$nmhi0,$hi1
745
746         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j]*bp[i]
747         ldw             12($xfer),$nmlo1
748         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j]*m
749         ldw             8($xfer),$nmhi1
750          add            $hi0,$ablo,$ablo
751         ldw             4($tp),$ti0             ; tp[j]
752          addc           %r0,$abhi,$abhi
753         fstds           ${fab1},0($xfer)
754          add            $ti1,$ablo,$ablo
755         fstds           ${fnm1},8($xfer)
756          addc           %r0,$abhi,$hi0
757         ldw             -16($xfer),$abhi
758          add            $ablo,$nmlo1,$nmlo1
759         ldw             -12($xfer),$ablo
760          addc           %r0,$nmhi1,$nmhi1
761         ldw             -8($xfer),$nmhi0
762          add            $hi1,$nmlo1,$nmlo1
763         ldw             -4($xfer),$nmlo0
764          addc           %r0,$nmhi1,$hi1
765
766         add             $hi0,$ablo,$ablo
767          stw            $nmlo1,-4($tp)          ; tp[j-1]
768         addc            %r0,$abhi,$abhi
769          add            $ti0,$ablo,$ablo
770         ldw             8($tp),$ti1             ; tp[j]
771          addc           %r0,$abhi,$hi0
772         ldw             0($xfer),$abhi
773          add            $ablo,$nmlo0,$nmlo0
774         ldw             4($xfer),$ablo
775          addc           %r0,$nmhi0,$nmhi0
776         ldws,mb         8($xfer),$nmhi1
777          add            $hi1,$nmlo0,$nmlo0
778         ldw             4($xfer),$nmlo1
779          addc           %r0,$nmhi0,$hi1
780          stws,ma        $nmlo0,8($tp)           ; tp[j-1]
781
782         addib,=         -1,$num,L\$outerdone_pa11; i--
783         subi            0,$arrsz,$idx           ; j=0
784
785          fldws,ma       4($bp),${fbi}           ; bp[i]
786          flddx          $idx($ap),${fai}        ; ap[0]
787         add             $hi0,$ablo,$ablo
788         addc            %r0,$abhi,$abhi
789          flddx          $idx($np),${fni}        ; np[0]
790          fldws          8($xfer),${fti}R        ; tp[0]
791         add             $ti1,$ablo,$ablo
792         addc            %r0,$abhi,$hi0
793
794          ldo            8($idx),$idx            ; j++++
795          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[i]
796          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[i]
797         ldw             4($tp),$ti0             ; tp[j]
798
799         add             $hi1,$nmlo1,$nmlo1
800         addc            %r0,$nmhi1,$nmhi1
801          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
802         add             $ablo,$nmlo1,$nmlo1
803         addc            %r0,$nmhi1,$hi1
804          fcpy,sgl       %fr0,${fti}L            ; zero high part
805          fcpy,sgl       %fr0,${fab0}L
806         stw             $nmlo1,-4($tp)          ; tp[j-1]
807
808          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
809          fcnvxf,dbl,dbl ${fab0},${fab0}
810         add             $hi1,$hi0,$hi0
811         addc            %r0,%r0,$hi1
812          fadd,dbl       ${fti},${fab0},${fab0}  ; add tp[0]
813         add             $ti0,$hi0,$hi0
814         addc            %r0,$hi1,$hi1
815          fcnvfx,dbl,dbl ${fab0},${fab0}         ; double -> 33-bit unsigned int
816         stw             $hi0,0($tp)
817         stw             $hi1,4($tp)
818          xmpyu          ${fn0},${fab0}R,${fm0}
819
820         b               L\$outer_pa11
821         ldo             `$LOCALS+32+4`($fp),$tp
822 \f
823 L\$outerdone_pa11
824         add             $hi0,$ablo,$ablo
825         addc            %r0,$abhi,$abhi
826         add             $ti1,$ablo,$ablo
827         addc            %r0,$abhi,$hi0
828
829         ldw             4($tp),$ti0             ; tp[j]
830
831         add             $hi1,$nmlo1,$nmlo1
832         addc            %r0,$nmhi1,$nmhi1
833         add             $ablo,$nmlo1,$nmlo1
834         addc            %r0,$nmhi1,$hi1
835         stw             $nmlo1,-4($tp)          ; tp[j-1]
836
837         add             $hi1,$hi0,$hi0
838         addc            %r0,%r0,$hi1
839         add             $ti0,$hi0,$hi0
840         addc            %r0,$hi1,$hi1
841         stw             $hi0,0($tp)
842         stw             $hi1,4($tp)
843
844         ldo             `$LOCALS+32+4`($fp),$tp
845         sub             %r0,%r0,%r0             ; clear borrow
846         ldw             -4($tp),$ti0
847         addl            $tp,$arrsz,$tp
848 L\$sub_pa11
849         ldwx            $idx($np),$hi0
850         subb            $ti0,$hi0,$hi1
851         ldwx            $idx($tp),$ti0
852         addib,<>        4,$idx,L\$sub_pa11
853         stws,ma         $hi1,4($rp)
854
855         subb            $ti0,%r0,$hi1
856
857         ldo             `$LOCALS+32`($fp),$tp
858         sub             $rp,$arrsz,$rp          ; rewind rp
859         subi            0,$arrsz,$idx
860 L\$copy_pa11
861         ldw             0($tp),$ti0
862         ldw             0($rp),$hi0
863         stws,ma         %r0,4($tp)
864         comiclr,=       0,$hi1,%r0
865         copy            $ti0,$hi0
866         addib,<>        4,$idx,L\$copy_pa11
867         stws,ma         $hi0,4($rp)
868
869         nop                                     ; alignment
870 L\$done
871 ___
872 }
873 \f
874 $code.=<<___;
875         ldi             1,%r28                  ; signal "handled"
876         ldo             $FRAME($fp),%sp         ; destroy tp[num+1]
877
878         $POP    `-$FRAME-$SAVED_RP`(%sp),%r2    ; standard epilogue
879         $POP    `-$FRAME+1*$SIZE_T`(%sp),%r4
880         $POP    `-$FRAME+2*$SIZE_T`(%sp),%r5
881         $POP    `-$FRAME+3*$SIZE_T`(%sp),%r6
882         $POP    `-$FRAME+4*$SIZE_T`(%sp),%r7
883         $POP    `-$FRAME+5*$SIZE_T`(%sp),%r8
884         $POP    `-$FRAME+6*$SIZE_T`(%sp),%r9
885         $POP    `-$FRAME+7*$SIZE_T`(%sp),%r10
886 L\$abort
887         bv      (%r2)
888         .EXIT
889         $POPMB  -$FRAME(%sp),%r3
890         .PROCEND
891         .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
892 ___
893 \f
894 # Explicitly encode PA-RISC 2.0 instructions used in this module, so
895 # that it can be compiled with .LEVEL 1.0. It should be noted that I
896 # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
897 # directive...
898
899 my $ldd = sub {
900   my ($mod,$args) = @_;
901   my $orig = "ldd$mod\t$args";
902
903     if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)         # format 4
904     {   my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
905         sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
906     }
907     elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)     # format 5
908     {   my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
909         $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12);                # encode offset
910         $opcode|=(1<<5)  if ($mod =~ /^,m/);
911         $opcode|=(1<<13) if ($mod =~ /^,mb/);
912         sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
913     }
914     else { "\t".$orig; }
915 };
916
917 my $std = sub {
918   my ($mod,$args) = @_;
919   my $orig = "std$mod\t$args";
920
921     if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/)        # format 6
922     {   my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
923         $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4);                  # encode offset
924         $opcode|=(1<<5)  if ($mod =~ /^,m/);
925         $opcode|=(1<<13) if ($mod =~ /^,mb/);
926         sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
927     }
928     else { "\t".$orig; }
929 };
930
931 my $extrd = sub {
932   my ($mod,$args) = @_;
933   my $orig = "extrd$mod\t$args";
934
935     # I only have ",u" completer, it's implicitly encoded...
936     if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)     # format 15
937     {   my $opcode=(0x36<<26)|($1<<21)|($4<<16);
938         my $len=32-$3;
939         $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);               # encode pos
940         $opcode |= (($len&0x20)<<7)|($len&0x1f);                # encode len
941         sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
942     }
943     elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)      # format 12
944     {   my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
945         my $len=32-$2;
946         $opcode |= (($len&0x20)<<3)|($len&0x1f);                # encode len
947         $opcode |= (1<<13) if ($mod =~ /,\**=/);
948         sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
949     }
950     else { "\t".$orig; }
951 };
952
953 my $shrpd = sub {
954   my ($mod,$args) = @_;
955   my $orig = "shrpd$mod\t$args";
956
957     if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)   # format 14
958     {   my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
959         my $cpos=63-$3;
960         $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);         # encode sa
961         sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
962     }
963     else { "\t".$orig; }
964 };
965
966 my $sub = sub {
967   my ($mod,$args) = @_;
968   my $orig = "sub$mod\t$args";
969
970     if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
971         my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
972         $opcode|=(1<<10);       # e1
973         $opcode|=(1<<8);        # e2
974         $opcode|=(1<<5);        # d
975         sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
976     }
977     else { "\t".$orig; }
978 };
979
980 sub assemble {
981   my ($mnemonic,$mod,$args)=@_;
982   my $opcode = eval("\$$mnemonic");
983
984     ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
985 }
986
987 foreach (split("\n",$code)) {
988         s/\`([^\`]*)\`/eval $1/ge;
989         # flip word order in 64-bit mode...
990         s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
991         # assemble 2.0 instructions in 32-bit mode...
992         s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
993
994         s/\bbv\b/bve/gm if ($SIZE_T==8);
995
996         print $_,"\n";
997 }
998 close STDOUT;