crypto/bn/asm/parisc-mont.pl

   1 #!/usr/bin/env perl
   2
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9
  10 # On PA-7100LC this module performs ~90-50% better, less for longer
  11 # keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
  12 # that compiler utilized xmpyu instruction to perform 32x32=64-bit
  13 # multiplication, which in turn means that "baseline" performance was
  14 # optimal in respect to instruction set capabilities. Fair comparison
  15 # with vendor compiler is problematic, because OpenSSL doesn't define
  16 # BN_LLONG [presumably] for historical reasons, which drives compiler
  17 # toward 4 times 16x16=32-bit multiplicatons [plus complementary
  18 # shifts and additions] instead. This means that you should observe
  19 # several times improvement over code generated by vendor compiler
  20 # for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
  21 # improvement coefficient was never collected on PA-7100LC, or any
  22 # other 1.1 CPU, because I don't have access to such machine with
  23 # vendor compiler. But to give you a taste, PA-RISC 1.1 code-path
  24 # reportedly outperformed code generated by cc +DA1.1 +O3 by factor
  25 # of ~5x on PA-8600.
  26 #
  27 # On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
  28 # reportedly ~2x faster than vendor compiler generated code [see
  29 # commentary in assembler source code]. Here comes a catch. Execution
  30 # core of this implementation is actually 32-bit one, in the sense
  31 # that it expects arrays of 32-bit BN_LONG values as input. But
  32 # pa-risc2[W].s operates on arrays of 64-bit BN_LONGs... How do they
  33 # interoperate then? Simple. This module picks halves of 64-bit
  34 # values in reverse order. But can it compete with "pure" 64-bit code
  35 # such as pa-risc2[W].s then? Well, the thing is that 32x32=64-bit
  36 # multiplication is the best even PA-RISC 2.0 can do, i.e. there is
  37 # no "wider" multiplication like on most other 64-bit platforms.
  38 # This means that even being effectively 32-bit, this implementation
  39 # performs the same computational task in same amount of arithmetic
  40 # operations, most notably multiplications. It requires more memory
  41 # references, most notably to tp[num], but this doesn't seem to
  42 # exhaust memory port capacity. In other words this implementation,
  43 # or more specifically its PA-RISC 2.0 code-path, competes with
  44 # pa-risc2W.s on virtually same terms.
  45 #
  46 # In case it wasn't clear. The module has two distinct code-paths:
  47 # PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
  48 # additions and 64-bit integer loads, not to mention specific
  49 # instruction scheduling. In 32-bit build module imposes couple of
  50 # limitations: vector lengths has to be even and vector addresses has
  51 # to be 64-bit aligned. Normally neither is a problem: most common
  52 # key lengths are even and vectors are commonly malloc-ed, which
  53 # ensures 64-bit alignment.
  54 #
  55 # Special thanks to polarhome.com for providing HP-UX account.
  56 \f
  57 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  58
  59 $flavour = shift;
  60 $output = shift;
  61
  62 open STDOUT,">$output";
  63
  64 if ($flavour =~ /64/) {
  65         $LEVEL          ="2.0W";
  66         $SIZE_T         =8;
  67         $FRAME_MARKER   =80;
  68         $SAVED_RP       =16;
  69         $PUSH           ="std";
  70         $PUSHMA         ="std,ma";
  71         $POP            ="ldd";
  72         $POPMB          ="ldd,mb";
  73         $BN_SZ          =$SIZE_T;
  74 } else {
  75         $LEVEL          ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
  76         $SIZE_T         =4;
  77         $FRAME_MARKER   =48;
  78         $SAVED_RP       =20;
  79         $PUSH           ="stw";
  80         $PUSHMA         ="stwm";
  81         $POP            ="ldw";
  82         $POPMB          ="ldwm";
  83         $BN_SZ          =$SIZE_T;
  84         if (open CONF,"<${dir}../../opensslconf.h") {
  85             while(<CONF>) {
  86                 if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
  87                     $BN_SZ=8;
  88                     $LEVEL="2.0";
  89                     last;
  90                 }
  91             }
  92             close CONF;
  93         }
  94 }
  95
  96 $FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker
  97                                 #                [+ argument transfer]
  98 $LOCALS=$FRAME-$FRAME_MARKER;
  99 $FRAME+=32;                     # local variables
 100
 101 $tp="%r31";
 102 $ti1="%r29";
 103 $ti0="%r28";
 104
 105 $rp="%r26";
 106 $ap="%r25";
 107 $bp="%r24";
 108 $np="%r23";
 109 $n0="%r22";     # passed through stack in 32-bit
 110 $num="%r21";    # passed through stack in 32-bit
 111 $idx="%r20";
 112 $arrsz="%r19";
 113
 114 $nm1="%r7";
 115 $nm0="%r6";
 116 $ab1="%r5";
 117 $ab0="%r4";
 118
 119 $fp="%r3";
 120 $hi1="%r2";
 121 $hi0="%r1";
 122
 123 $xfer=$n0;      # accomodates [-16..15] offset in fld[dw]s
 124
 125 $fm0="%fr4";    $fti=$fm0;
 126 $fbi="%fr5L";
 127 $fn0="%fr5R";
 128 $fai="%fr6";    $fab0="%fr7";   $fab1="%fr8";
 129 $fni="%fr9";    $fnm0="%fr10";  $fnm1="%fr11";
 130
 131 $code=<<___;
 132         .LEVEL  $LEVEL
 133         .SPACE  \$TEXT\$
 134         .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
 135
 136         .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
 137         .ALIGN  16
 138 bn_mul_mont
 139         .PROC
 140         .CALLINFO       FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
 141         .ENTRY
 142         $PUSH   %r2,-$SAVED_RP(%sp)             ; standard prologue
 143         $PUSHMA %r3,$FRAME(%sp)
 144         $PUSH   %r4,`-$FRAME+1*$SIZE_T`(%sp)
 145         $PUSH   %r5,`-$FRAME+2*$SIZE_T`(%sp)
 146         $PUSH   %r6,`-$FRAME+3*$SIZE_T`(%sp)
 147         $PUSH   %r7,`-$FRAME+4*$SIZE_T`(%sp)
 148         $PUSH   %r8,`-$FRAME+5*$SIZE_T`(%sp)
 149         $PUSH   %r9,`-$FRAME+6*$SIZE_T`(%sp)
 150         $PUSH   %r10,`-$FRAME+7*$SIZE_T`(%sp)
 151         ldo     -$FRAME(%sp),$fp
 152 ___
 153 $code.=<<___ if ($SIZE_T==4);
 154         ldw     `-$FRAME_MARKER-4`($fp),$n0
 155         ldw     `-$FRAME_MARKER-8`($fp),$num
 156         nop
 157         nop                                     ; alignment
 158 ___
 159 $code.=<<___ if ($BN_SZ==4);
 160         comiclr,<=      6,$num,%r0              ; are vectors long enough?
 161         b               L\$abort
 162         ldi             0,%r28                  ; signal "unhandled"
 163         add,ev          %r0,$num,$num           ; is $num even?
 164         b               L\$abort
 165         nop
 166         or              $ap,$np,$ti1
 167         extru,=         $ti1,31,3,%r0           ; are ap and np 64-bit aligned?
 168         b               L\$abort
 169         nop
 170         nop                                     ; alignment
 171
 172         fldws           0($n0),${fn0}
 173         fldws,ma        4($bp),${fbi}           ; bp[0]
 174 ___
 175 $code.=<<___ if ($BN_SZ==8);
 176         comib,>         3,$num,L\$abort         ; are vectors long enough?
 177         ldi             0,%r28                  ; signal "unhandled"
 178         addl            $num,$num,$num          ; I operate on 32-bit values
 179
 180         fldws           4($n0),${fn0}           ; only low part of n0
 181         fldws           4($bp),${fbi}           ; bp[0] in flipped word order
 182 ___
 183 $code.=<<___;
 184         fldds           0($ap),${fai}           ; ap[0,1]
 185         fldds           0($np),${fni}           ; np[0,1]
 186
 187         sh2addl         $num,%r0,$arrsz
 188         ldi             31,$hi0
 189         ldo             36($arrsz),$hi1         ; space for tp[num+1]
 190         andcm           $hi1,$hi0,$hi1          ; align
 191         addl            $hi1,%sp,%sp
 192         $PUSH           $fp,-$SIZE_T(%sp)
 193
 194         ldo             `$LOCALS+16`($fp),$xfer
 195         ldo             `$LOCALS+32+4`($fp),$tp
 196
 197         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[0]*bp[0]
 198         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[1]*bp[0]
 199         xmpyu           ${fn0},${fab0}R,${fm0}
 200
 201         addl            $arrsz,$ap,$ap          ; point at the end
 202         addl            $arrsz,$np,$np
 203         subi            0,$arrsz,$idx           ; j=0
 204         ldo             8($idx),$idx            ; j++++
 205
 206         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[0]*m
 207         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[1]*m
 208         fstds           ${fab0},-16($xfer)
 209         fstds           ${fnm0},-8($xfer)
 210         fstds           ${fab1},0($xfer)
 211         fstds           ${fnm1},8($xfer)
 212          flddx          $idx($ap),${fai}        ; ap[2,3]
 213          flddx          $idx($np),${fni}        ; np[2,3]
 214 ___
 215 $code.=<<___ if ($BN_SZ==4);
 216         mtctl           $hi0,%cr11              ; $hi0 still holds 31
 217         extrd,u,*=      $hi0,%sar,1,$hi0        ; executes on PA-RISC 1.0
 218         b               L\$parisc11
 219         nop
 220 ___
 221 $code.=<<___;                                   # PA-RISC 2.0 code-path
 222         ldd             -16($xfer),$ab0
 223         ldd             -8($xfer),$nm0
 224
 225         extrd,u         $ab0,31,32,$hi0
 226         extrd,u         $ab0,63,32,$ab0
 227          ldo            8($idx),$idx            ; j++++
 228          addl           $ab0,$nm0,$nm0          ; low part is discarded
 229          extrd,u        $nm0,31,32,$hi1
 230         ldd             0($xfer),$ab1
 231 \f
 232 L\$1st
 233         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
 234         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 235          ldd            8($xfer),$nm1
 236         fstds           ${fab0},-16($xfer)
 237         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[0]
 238         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
 239         fstds           ${fnm0},-8($xfer)
 240          addl           $hi0,$ab1,$ab1
 241         fstds           ${fab1},0($xfer)
 242          extrd,u        $ab1,31,32,$hi0
 243         fstds           ${fnm1},8($xfer)
 244          extrd,u        $ab1,63,32,$ab1
 245          addl           $hi1,$nm1,$nm1
 246         ldd             -16($xfer),$ab0
 247          addl           $ab1,$nm1,$nm1
 248         ldd             -8($xfer),$nm0
 249          extrd,u        $nm1,31,32,$hi1
 250
 251         flddx           $idx($ap),${fai}        ; ap[j,j+1]
 252          addl           $hi0,$ab0,$ab0
 253         flddx           $idx($np),${fni}        ; np[j,j+1]
 254          extrd,u        $ab0,31,32,$hi0
 255         stw             $nm1,-4($tp)            ; tp[j-1]
 256          extrd,u        $ab0,63,32,$ab0
 257          addl           $hi1,$nm0,$nm0
 258          addl           $ab0,$nm0,$nm0
 259         ldd             0($xfer),$ab1
 260          stw,ma         $nm0,8($tp)             ; tp[j-1]
 261         addib,<>        8,$idx,L\$1st           ; j++++
 262          extrd,u        $nm0,31,32,$hi1
 263
 264         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
 265         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 266          ldd            8($xfer),$nm1
 267         fstds           ${fab0},-16($xfer)
 268         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j]*bp[0]
 269         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j]*m
 270         fstds           ${fnm0},-8($xfer)
 271          addl           $hi0,$ab1,$ab1
 272         fstds           ${fab1},0($xfer)
 273          extrd,u        $ab1,31,32,$hi0
 274         fstds           ${fnm1},8($xfer)
 275          extrd,u        $ab1,63,32,$ab1
 276          addl           $hi1,$nm1,$nm1
 277         ldd             -16($xfer),$ab0
 278          addl           $ab1,$nm1,$nm1
 279         ldd             -8($xfer),$nm0
 280          extrd,u        $nm1,31,32,$hi1
 281
 282          addl           $hi0,$ab0,$ab0
 283          extrd,u        $ab0,31,32,$hi0
 284         stw             $nm1,-4($tp)            ; tp[j-1]
 285          extrd,u        $ab0,63,32,$ab0
 286          addl           $hi1,$nm0,$nm0
 287         ldd             0($xfer),$ab1
 288          addl           $ab0,$nm0,$nm0
 289         ldd,mb          8($xfer),$nm1
 290          extrd,u        $nm0,31,32,$hi1
 291         stw,ma          $nm0,8($tp)             ; tp[j-1]
 292
 293         ldo             -1($num),$num           ; i--
 294         subi            0,$arrsz,$idx           ; j=0
 295 ___
 296 $code.=<<___ if ($BN_SZ==4);
 297         fldws,ma        4($bp),${fbi}           ; bp[1]
 298 ___
 299 $code.=<<___ if ($BN_SZ==8);
 300         fldws           0($bp),${fbi}           ; bp[1] in flipped word order
 301 ___
 302 $code.=<<___;
 303          flddx          $idx($ap),${fai}        ; ap[0,1]
 304          flddx          $idx($np),${fni}        ; np[0,1]
 305          fldws          8($xfer),${fti}R        ; tp[0]
 306         addl            $hi0,$ab1,$ab1
 307          extrd,u        $ab1,31,32,$hi0
 308          extrd,u        $ab1,63,32,$ab1
 309          ldo            8($idx),$idx            ; j++++
 310          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[1]
 311          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[1]
 312         addl            $hi1,$nm1,$nm1
 313         addl            $ab1,$nm1,$nm1
 314         extrd,u         $nm1,31,32,$hi1
 315          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
 316         stw             $nm1,-4($tp)            ; tp[j-1]
 317
 318          fcpy,sgl       %fr0,${fti}L            ; zero high part
 319          fcpy,sgl       %fr0,${fab0}L
 320         addl            $hi1,$hi0,$hi0
 321         extrd,u         $hi0,31,32,$hi1
 322          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
 323          fcnvxf,dbl,dbl ${fab0},${fab0}
 324         stw             $hi0,0($tp)
 325         stw             $hi1,4($tp)
 326
 327         fadd,dbl        ${fti},${fab0},${fab0}  ; add tp[0]
 328         fcnvfx,dbl,dbl  ${fab0},${fab0}         ; double -> 33-bit unsigned int
 329         xmpyu           ${fn0},${fab0}R,${fm0}
 330         ldo             `$LOCALS+32+4`($fp),$tp
 331 L\$outer
 332         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[0]*m
 333         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[1]*m
 334         fstds           ${fab0},-16($xfer)      ; 33-bit value
 335         fstds           ${fnm0},-8($xfer)
 336          flddx          $idx($ap),${fai}        ; ap[2]
 337          flddx          $idx($np),${fni}        ; np[2]
 338          ldo            8($idx),$idx            ; j++++
 339         ldd             -16($xfer),$ab0         ; 33-bit value
 340         ldd             -8($xfer),$nm0
 341         ldw             0($xfer),$hi0           ; high part
 342
 343          extrd,u        $ab0,31,32,$ti0         ; carry bit
 344          extrd,u        $ab0,63,32,$ab0
 345         fstds           ${fab1},0($xfer)
 346          addl           $ti0,$hi0,$hi0          ; account carry bit
 347         fstds           ${fnm1},8($xfer)
 348          addl           $ab0,$nm0,$nm0          ; low part is discarded
 349         ldw             0($tp),$ti1             ; tp[1]
 350          extrd,u        $nm0,31,32,$hi1
 351         ldd             0($xfer),$ab1
 352 \f
 353 L\$inner
 354         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
 355         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 356          ldd            8($xfer),$nm1
 357         fstds           ${fab0},-16($xfer)
 358         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[i]
 359         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
 360         fstds           ${fnm0},-8($xfer)
 361         ldw             4($tp),$ti0             ; tp[j]
 362          addl           $hi0,$ab1,$ab1
 363         fstds           ${fab1},0($xfer)
 364          addl           $ti1,$ab1,$ab1
 365          extrd,u        $ab1,31,32,$hi0
 366         fstds           ${fnm1},8($xfer)
 367          extrd,u        $ab1,63,32,$ab1
 368          addl           $hi1,$nm1,$nm1
 369         ldd             -16($xfer),$ab0
 370          addl           $ab1,$nm1,$nm1
 371         ldd             -8($xfer),$nm0
 372          extrd,u        $nm1,31,32,$hi1
 373
 374         flddx           $idx($ap),${fai}        ; ap[j,j+1]
 375          addl           $hi0,$ab0,$ab0
 376         flddx           $idx($np),${fni}        ; np[j,j+1]
 377          addl           $ti0,$ab0,$ab0
 378         stw             $nm1,-4($tp)            ; tp[j-1]
 379          extrd,u        $ab0,31,32,$hi0
 380         ldw             8($tp),$ti1             ; tp[j]
 381          extrd,u        $ab0,63,32,$ab0
 382          addl           $hi1,$nm0,$nm0
 383          addl           $ab0,$nm0,$nm0
 384         ldd             0($xfer),$ab1
 385          stw,ma         $nm0,8($tp)             ; tp[j-1]
 386         addib,<>        8,$idx,L\$inner         ; j++++
 387          extrd,u        $nm0,31,32,$hi1
 388
 389         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
 390         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 391          ldd            8($xfer),$nm1
 392         fstds           ${fab0},-16($xfer)
 393         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j]*bp[i]
 394         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j]*m
 395         fstds           ${fnm0},-8($xfer)
 396         ldw             4($tp),$ti0             ; tp[j]
 397          addl           $hi0,$ab1,$ab1
 398         fstds           ${fab1},0($xfer)
 399          addl           $ti1,$ab1,$ab1
 400          extrd,u        $ab1,31,32,$hi0
 401         fstds           ${fnm1},8($xfer)
 402          extrd,u        $ab1,63,32,$ab1
 403          addl           $hi1,$nm1,$nm1
 404         ldd             -16($xfer),$ab0
 405          addl           $ab1,$nm1,$nm1
 406         ldd             -8($xfer),$nm0
 407          extrd,u        $nm1,31,32,$hi1
 408
 409         addl            $hi0,$ab0,$ab0
 410          addl           $ti0,$ab0,$ab0
 411          stw            $nm1,-4($tp)            ; tp[j-1]
 412          extrd,u        $ab0,31,32,$hi0
 413         ldw             8($tp),$ti1             ; tp[j]
 414          extrd,u        $ab0,63,32,$ab0
 415          addl           $hi1,$nm0,$nm0
 416         ldd             0($xfer),$ab1
 417          addl           $ab0,$nm0,$nm0
 418         ldd,mb          8($xfer),$nm1
 419          extrd,u        $nm0,31,32,$hi1
 420          stw,ma         $nm0,8($tp)             ; tp[j-1]
 421
 422         addib,=         -1,$num,L\$outerdone    ; i--
 423         subi            0,$arrsz,$idx           ; j=0
 424 ___
 425 $code.=<<___ if ($BN_SZ==4);
 426         fldws,ma        4($bp),${fbi}           ; bp[i]
 427 ___
 428 $code.=<<___ if ($BN_SZ==8);
 429         ldi             12,$ti0                 ; bp[i] in flipped word order
 430         addl,ev         %r0,$num,$num
 431         ldi             -4,$ti0
 432         addl            $ti0,$bp,$bp
 433         fldws           0($bp),${fbi}
 434 ___
 435 $code.=<<___;
 436          flddx          $idx($ap),${fai}        ; ap[0]
 437         addl            $hi0,$ab1,$ab1
 438          flddx          $idx($np),${fni}        ; np[0]
 439          fldws          8($xfer),${fti}R        ; tp[0]
 440         addl            $ti1,$ab1,$ab1
 441         extrd,u         $ab1,31,32,$hi0
 442         extrd,u         $ab1,63,32,$ab1
 443
 444          ldo            8($idx),$idx            ; j++++
 445          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[i]
 446          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[i]
 447         ldw             4($tp),$ti0             ; tp[j]
 448
 449         addl            $hi1,$nm1,$nm1
 450          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
 451         addl            $ab1,$nm1,$nm1
 452         extrd,u         $nm1,31,32,$hi1
 453          fcpy,sgl       %fr0,${fti}L            ; zero high part
 454          fcpy,sgl       %fr0,${fab0}L
 455         stw             $nm1,-4($tp)            ; tp[j-1]
 456
 457          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
 458          fcnvxf,dbl,dbl ${fab0},${fab0}
 459         addl            $hi1,$hi0,$hi0
 460          fadd,dbl       ${fti},${fab0},${fab0}  ; add tp[0]
 461         addl            $ti0,$hi0,$hi0
 462         extrd,u         $hi0,31,32,$hi1
 463          fcnvfx,dbl,dbl ${fab0},${fab0}         ; double -> 33-bit unsigned int
 464         stw             $hi0,0($tp)
 465         stw             $hi1,4($tp)
 466          xmpyu          ${fn0},${fab0}R,${fm0}
 467
 468         b               L\$outer
 469         ldo             `$LOCALS+32+4`($fp),$tp
 470 \f
 471 L\$outerdone
 472         addl            $hi0,$ab1,$ab1
 473         addl            $ti1,$ab1,$ab1
 474         extrd,u         $ab1,31,32,$hi0
 475         extrd,u         $ab1,63,32,$ab1
 476
 477         ldw             4($tp),$ti0             ; tp[j]
 478
 479         addl            $hi1,$nm1,$nm1
 480         addl            $ab1,$nm1,$nm1
 481         extrd,u         $nm1,31,32,$hi1
 482         stw             $nm1,-4($tp)            ; tp[j-1]
 483
 484         addl            $hi1,$hi0,$hi0
 485         addl            $ti0,$hi0,$hi0
 486         extrd,u         $hi0,31,32,$hi1
 487         stw             $hi0,0($tp)
 488         stw             $hi1,4($tp)
 489
 490         ldo             `$LOCALS+32`($fp),$tp
 491         sub             %r0,%r0,%r0             ; clear borrow
 492 ___
 493 $code.=<<___ if ($BN_SZ==4);
 494         ldws,ma         4($tp),$ti0
 495         extru,=         $rp,31,3,%r0            ; is rp 64-bit aligned?
 496         b               L\$sub_pa11
 497         addl            $tp,$arrsz,$tp
 498 L\$sub
 499         ldwx            $idx($np),$hi0
 500         subb            $ti0,$hi0,$hi1
 501         ldwx            $idx($tp),$ti0
 502         addib,<>        4,$idx,L\$sub
 503         stws,ma         $hi1,4($rp)
 504
 505         subb            $ti0,%r0,$hi1
 506         ldo             -4($tp),$tp
 507 ___
 508 $code.=<<___ if ($BN_SZ==8);
 509         ldd,ma          8($tp),$ti0
 510 L\$sub
 511         ldd             $idx($np),$hi0
 512         shrpd           $ti0,$ti0,32,$ti0       ; flip word order
 513         std             $ti0,-8($tp)            ; save flipped value
 514         sub,db          $ti0,$hi0,$hi1
 515         ldd,ma          8($tp),$ti0
 516         addib,<>        8,$idx,L\$sub
 517         std,ma          $hi1,8($rp)
 518
 519         extrd,u         $ti0,31,32,$ti0         ; carry in flipped word order
 520         sub,db          $ti0,%r0,$hi1
 521         ldo             -8($tp),$tp
 522 ___
 523 $code.=<<___;
 524         and             $tp,$hi1,$ap
 525         andcm           $rp,$hi1,$bp
 526         or              $ap,$bp,$np
 527
 528         sub             $rp,$arrsz,$rp          ; rewind rp
 529         subi            0,$arrsz,$idx
 530         ldo             `$LOCALS+32`($fp),$tp
 531 L\$copy
 532         ldd             $idx($np),$hi0
 533         std,ma          %r0,8($tp)
 534         addib,<>        8,$idx,.-8              ; L\$copy
 535         std,ma          $hi0,8($rp)
 536 ___
 537
 538 if ($BN_SZ==4) {                                # PA-RISC 1.1 code-path
 539 $ablo=$ab0;
 540 $abhi=$ab1;
 541 $nmlo0=$nm0;
 542 $nmhi0=$nm1;
 543 $nmlo1="%r9";
 544 $nmhi1="%r8";
 545
 546 $code.=<<___;
 547         b               L\$done
 548         nop
 549
 550         .ALIGN          8
 551 L\$parisc11
 552         ldw             -16($xfer),$hi0
 553         ldw             -12($xfer),$ablo
 554         ldw             -8($xfer),$nmhi0
 555         ldw             -4($xfer),$nmlo0
 556
 557          ldo            8($idx),$idx            ; j++++
 558          add            $ablo,$nmlo0,$nmlo0     ; discarded
 559          addc           %r0,$nmhi0,$hi1
 560         ldw             0($xfer),$abhi
 561         ldw             4($xfer),$ablo
 562         nop
 563 \f
 564 L\$1st_pa11
 565         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
 566          ldw            8($xfer),$nmhi1
 567         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 568          ldw            12($xfer),$nmlo1
 569         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[0]
 570         fstds           ${fab0},-16($xfer)
 571         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
 572         fstds           ${fnm0},-8($xfer)
 573          add            $hi0,$ablo,$ablo
 574         fstds           ${fab1},0($xfer)
 575          addc           %r0,$abhi,$hi0
 576         fstds           ${fnm1},8($xfer)
 577          add            $ablo,$nmlo1,$nmlo1
 578         ldw             -16($xfer),$abhi
 579          addc           %r0,$nmhi1,$nmhi1
 580         ldw             -12($xfer),$ablo
 581          add            $hi1,$nmlo1,$nmlo1
 582         ldw             -8($xfer),$nmhi0
 583          addc           %r0,$nmhi1,$hi1
 584         ldw             -4($xfer),$nmlo0
 585
 586          add            $hi0,$ablo,$ablo
 587         flddx           $idx($ap),${fai}        ; ap[j,j+1]
 588          addc           %r0,$abhi,$hi0
 589         flddx           $idx($np),${fni}        ; np[j,j+1]
 590          add            $ablo,$nmlo0,$nmlo0
 591         stw             $nmlo1,-4($tp)          ; tp[j-1]
 592          addc           %r0,$nmhi0,$nmhi0
 593         ldw             0($xfer),$abhi
 594          add            $hi1,$nmlo0,$nmlo0
 595         ldw             4($xfer),$ablo
 596          stws,ma        $nmlo0,8($tp)           ; tp[j-1]
 597         addib,<>        8,$idx,L\$1st_pa11      ; j++++
 598          addc           %r0,$nmhi0,$hi1
 599
 600         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
 601          ldw            8($xfer),$nmhi1
 602         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 603          ldw            12($xfer),$nmlo1
 604         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j]*bp[0]
 605         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j]*m
 606         fstds           ${fab0},-16($xfer)
 607         fstds           ${fnm0},-8($xfer)
 608          add            $hi0,$ablo,$ablo
 609         fstds           ${fab1},0($xfer)
 610          addc           %r0,$abhi,$hi0
 611         fstds           ${fnm1},8($xfer)
 612          add            $ablo,$nmlo1,$nmlo1
 613         ldw             -16($xfer),$abhi
 614          addc           %r0,$nmhi1,$nmhi1
 615         ldw             -12($xfer),$ablo
 616          add            $hi1,$nmlo1,$nmlo1
 617         ldw             -8($xfer),$nmhi0
 618          addc           %r0,$nmhi1,$hi1
 619         ldw             -4($xfer),$nmlo0
 620
 621          add            $hi0,$ablo,$ablo
 622         stw             $nmlo1,-4($tp)          ; tp[j-1]
 623          addc           %r0,$abhi,$hi0
 624         ldw             0($xfer),$abhi
 625          add            $ablo,$nmlo0,$nmlo0
 626         ldw             4($xfer),$ablo
 627          addc           %r0,$nmhi0,$nmhi0
 628         ldws,mb         8($xfer),$nmhi1
 629          add            $hi1,$nmlo0,$nmlo0
 630         ldw             4($xfer),$nmlo1
 631          addc           %r0,$nmhi0,$hi1
 632         stws,ma         $nmlo0,8($tp)           ; tp[j-1]
 633
 634         ldo             -1($num),$num           ; i--
 635         subi            0,$arrsz,$idx           ; j=0
 636
 637          fldws,ma       4($bp),${fbi}           ; bp[1]
 638          flddx          $idx($ap),${fai}        ; ap[0,1]
 639          flddx          $idx($np),${fni}        ; np[0,1]
 640          fldws          8($xfer),${fti}R        ; tp[0]
 641         add             $hi0,$ablo,$ablo
 642         addc            %r0,$abhi,$hi0
 643          ldo            8($idx),$idx            ; j++++
 644          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[1]
 645          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[1]
 646         add             $hi1,$nmlo1,$nmlo1
 647         addc            %r0,$nmhi1,$nmhi1
 648         add             $ablo,$nmlo1,$nmlo1
 649         addc            %r0,$nmhi1,$hi1
 650          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
 651         stw             $nmlo1,-4($tp)          ; tp[j-1]
 652
 653          fcpy,sgl       %fr0,${fti}L            ; zero high part
 654          fcpy,sgl       %fr0,${fab0}L
 655         add             $hi1,$hi0,$hi0
 656         addc            %r0,%r0,$hi1
 657          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
 658          fcnvxf,dbl,dbl ${fab0},${fab0}
 659         stw             $hi0,0($tp)
 660         stw             $hi1,4($tp)
 661
 662         fadd,dbl        ${fti},${fab0},${fab0}  ; add tp[0]
 663         fcnvfx,dbl,dbl  ${fab0},${fab0}         ; double -> 33-bit unsigned int
 664         xmpyu           ${fn0},${fab0}R,${fm0}
 665         ldo             `$LOCALS+32+4`($fp),$tp
 666 L\$outer_pa11
 667         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[0]*m
 668         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[1]*m
 669         fstds           ${fab0},-16($xfer)      ; 33-bit value
 670         fstds           ${fnm0},-8($xfer)
 671          flddx          $idx($ap),${fai}        ; ap[2,3]
 672          flddx          $idx($np),${fni}        ; np[2,3]
 673         ldw             -16($xfer),$abhi        ; carry bit actually
 674          ldo            8($idx),$idx            ; j++++
 675         ldw             -12($xfer),$ablo
 676         ldw             -8($xfer),$nmhi0
 677         ldw             -4($xfer),$nmlo0
 678         ldw             0($xfer),$hi0           ; high part
 679
 680         fstds           ${fab1},0($xfer)
 681          addl           $abhi,$hi0,$hi0         ; account carry bit
 682         fstds           ${fnm1},8($xfer)
 683          add            $ablo,$nmlo0,$nmlo0     ; discarded
 684         ldw             0($tp),$ti1             ; tp[1]
 685          addc           %r0,$nmhi0,$hi1
 686         ldw             0($xfer),$abhi
 687         ldw             4($xfer),$ablo
 688 \f
 689 L\$inner_pa11
 690         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
 691          ldw            8($xfer),$nmhi1
 692         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 693          ldw            12($xfer),$nmlo1
 694         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[i]
 695         fstds           ${fab0},-16($xfer)
 696         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
 697         fstds           ${fnm0},-8($xfer)
 698          add            $hi0,$ablo,$ablo
 699         ldw             4($tp),$ti0             ; tp[j]
 700          addc           %r0,$abhi,$abhi
 701         fstds           ${fab1},0($xfer)
 702          add            $ti1,$ablo,$ablo
 703         fstds           ${fnm1},8($xfer)
 704          addc           %r0,$abhi,$hi0
 705         ldw             -16($xfer),$abhi
 706          add            $ablo,$nmlo1,$nmlo1
 707         ldw             -12($xfer),$ablo
 708          addc           %r0,$nmhi1,$nmhi1
 709         ldw             -8($xfer),$nmhi0
 710          add            $hi1,$nmlo1,$nmlo1
 711         ldw             -4($xfer),$nmlo0
 712          addc           %r0,$nmhi1,$hi1
 713
 714         flddx           $idx($ap),${fai}        ; ap[j,j+1]
 715          addl,nuv       $hi0,$ablo,$ablo
 716          addi           1,$abhi,$abhi
 717         flddx           $idx($np),${fni}        ; np[j,j+1]
 718          add            $ti0,$ablo,$ablo
 719         stw             $nmlo1,-4($tp)          ; tp[j-1]
 720          addc           %r0,$abhi,$hi0
 721         ldw             8($tp),$ti1             ; tp[j]
 722          addl,nuv       $ablo,$nmlo0,$nmlo0
 723          addi           1,$nmhi0,$nmhi0
 724         ldw             0($xfer),$abhi
 725          add            $hi1,$nmlo0,$nmlo0
 726         ldw             4($xfer),$ablo
 727          stws,ma        $nmlo0,8($tp)           ; tp[j-1]
 728         addib,<>        8,$idx,L\$inner_pa11    ; j++++
 729          addc           %r0,$nmhi0,$hi1
 730
 731         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
 732          ldw            8($xfer),$nmhi1
 733         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 734          ldw            12($xfer),$nmlo1
 735         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j]*bp[i]
 736         fstds           ${fab0},-16($xfer)
 737         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j]*m
 738         fstds           ${fnm0},-8($xfer)
 739          add            $hi0,$ablo,$ablo
 740         ldw             4($tp),$ti0             ; tp[j]
 741          addc           %r0,$abhi,$abhi
 742         fstds           ${fab1},0($xfer)
 743          add            $ti1,$ablo,$ablo
 744         fstds           ${fnm1},8($xfer)
 745          addc           %r0,$abhi,$hi0
 746         ldw             -16($xfer),$abhi
 747          add            $ablo,$nmlo1,$nmlo1
 748         ldw             -12($xfer),$ablo
 749          addc           %r0,$nmhi1,$nmhi1
 750         ldw             -8($xfer),$nmhi0
 751          add            $hi1,$nmlo1,$nmlo1
 752         ldw             -4($xfer),$nmlo0
 753          addc           %r0,$nmhi1,$hi1
 754
 755         add             $hi0,$ablo,$ablo
 756          stw            $nmlo1,-4($tp)          ; tp[j-1]
 757         addc            %r0,$abhi,$abhi
 758          add            $ti0,$ablo,$ablo
 759         ldw             8($tp),$ti1             ; tp[j]
 760          addc           %r0,$abhi,$hi0
 761         ldw             0($xfer),$abhi
 762          add            $ablo,$nmlo0,$nmlo0
 763         ldw             4($xfer),$ablo
 764          addc           %r0,$nmhi0,$nmhi0
 765         ldws,mb         8($xfer),$nmhi1
 766          add            $hi1,$nmlo0,$nmlo0
 767         ldw             4($xfer),$nmlo1
 768          addc           %r0,$nmhi0,$hi1
 769          stws,ma        $nmlo0,8($tp)           ; tp[j-1]
 770
 771         addib,=         -1,$num,L\$outerdone_pa11; i--
 772         subi            0,$arrsz,$idx           ; j=0
 773
 774          fldws,ma       4($bp),${fbi}           ; bp[i]
 775          flddx          $idx($ap),${fai}        ; ap[0]
 776         add             $hi0,$ablo,$ablo
 777         addc            %r0,$abhi,$abhi
 778          flddx          $idx($np),${fni}        ; np[0]
 779          fldws          8($xfer),${fti}R        ; tp[0]
 780         add             $ti1,$ablo,$ablo
 781         addc            %r0,$abhi,$hi0
 782
 783          ldo            8($idx),$idx            ; j++++
 784          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[i]
 785          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[i]
 786         ldw             4($tp),$ti0             ; tp[j]
 787
 788         add             $hi1,$nmlo1,$nmlo1
 789         addc            %r0,$nmhi1,$nmhi1
 790          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
 791         add             $ablo,$nmlo1,$nmlo1
 792         addc            %r0,$nmhi1,$hi1
 793          fcpy,sgl       %fr0,${fti}L            ; zero high part
 794          fcpy,sgl       %fr0,${fab0}L
 795         stw             $nmlo1,-4($tp)          ; tp[j-1]
 796
 797          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
 798          fcnvxf,dbl,dbl ${fab0},${fab0}
 799         add             $hi1,$hi0,$hi0
 800         addc            %r0,%r0,$hi1
 801          fadd,dbl       ${fti},${fab0},${fab0}  ; add tp[0]
 802         add             $ti0,$hi0,$hi0
 803         addc            %r0,$hi1,$hi1
 804          fcnvfx,dbl,dbl ${fab0},${fab0}         ; double -> 33-bit unsigned int
 805         stw             $hi0,0($tp)
 806         stw             $hi1,4($tp)
 807          xmpyu          ${fn0},${fab0}R,${fm0}
 808
 809         b               L\$outer_pa11
 810         ldo             `$LOCALS+32+4`($fp),$tp
 811 \f
 812 L\$outerdone_pa11
 813         add             $hi0,$ablo,$ablo
 814         addc            %r0,$abhi,$abhi
 815         add             $ti1,$ablo,$ablo
 816         addc            %r0,$abhi,$hi0
 817
 818         ldw             4($tp),$ti0             ; tp[j]
 819
 820         add             $hi1,$nmlo1,$nmlo1
 821         addc            %r0,$nmhi1,$nmhi1
 822         add             $ablo,$nmlo1,$nmlo1
 823         addc            %r0,$nmhi1,$hi1
 824         stw             $nmlo1,-4($tp)          ; tp[j-1]
 825
 826         add             $hi1,$hi0,$hi0
 827         addc            %r0,%r0,$hi1
 828         add             $ti0,$hi0,$hi0
 829         addc            %r0,$hi1,$hi1
 830         stw             $hi0,0($tp)
 831         stw             $hi1,4($tp)
 832
 833         ldo             `$LOCALS+32+4`($fp),$tp
 834         sub             %r0,%r0,%r0             ; clear borrow
 835         ldw             -4($tp),$ti0
 836         addl            $tp,$arrsz,$tp
 837 L\$sub_pa11
 838         ldwx            $idx($np),$hi0
 839         subb            $ti0,$hi0,$hi1
 840         ldwx            $idx($tp),$ti0
 841         addib,<>        4,$idx,L\$sub_pa11
 842         stws,ma         $hi1,4($rp)
 843
 844         subb            $ti0,%r0,$hi1
 845         ldo             -4($tp),$tp
 846         and             $tp,$hi1,$ap
 847         andcm           $rp,$hi1,$bp
 848         or              $ap,$bp,$np
 849
 850         sub             $rp,$arrsz,$rp          ; rewind rp
 851         subi            0,$arrsz,$idx
 852         ldo             `$LOCALS+32`($fp),$tp
 853 L\$copy_pa11
 854         ldwx            $idx($np),$hi0
 855         stws,ma         %r0,4($tp)
 856         addib,<>        4,$idx,L\$copy_pa11
 857         stws,ma         $hi0,4($rp)
 858
 859         nop                                     ; alignment
 860 L\$done
 861 ___
 862 }
 863 \f
 864 $code.=<<___;
 865         ldi             1,%r28                  ; signal "handled"
 866         ldo             $FRAME($fp),%sp         ; destroy tp[num+1]
 867
 868         $POP    `-$FRAME-$SAVED_RP`(%sp),%r2    ; standard epilogue
 869         $POP    `-$FRAME+1*$SIZE_T`(%sp),%r4
 870         $POP    `-$FRAME+2*$SIZE_T`(%sp),%r5
 871         $POP    `-$FRAME+3*$SIZE_T`(%sp),%r6
 872         $POP    `-$FRAME+4*$SIZE_T`(%sp),%r7
 873         $POP    `-$FRAME+5*$SIZE_T`(%sp),%r8
 874         $POP    `-$FRAME+6*$SIZE_T`(%sp),%r9
 875         $POP    `-$FRAME+7*$SIZE_T`(%sp),%r10
 876 L\$abort
 877         bv      (%r2)
 878         .EXIT
 879         $POPMB  -$FRAME(%sp),%r3
 880         .PROCEND
 881         .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
 882 ___
 883 \f
 884 # Explicitly encode PA-RISC 2.0 instructions used in this module, so
 885 # that it can be compiled with .LEVEL 1.0. It should be noted that I
 886 # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
 887 # directive...
 888
 889 my $ldd = sub {
 890   my ($mod,$args) = @_;
 891   my $orig = "ldd$mod\t$args";
 892
 893     if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)         # format 4
 894     {   my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
 895         sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
 896     }
 897     elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)     # format 5
 898     {   my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
 899         $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12);                # encode offset
 900         $opcode|=(1<<5)  if ($mod =~ /^,m/);
 901         $opcode|=(1<<13) if ($mod =~ /^,mb/);
 902         sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
 903     }
 904     else { "\t".$orig; }
 905 };
 906
 907 my $std = sub {
 908   my ($mod,$args) = @_;
 909   my $orig = "std$mod\t$args";
 910
 911     if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/)        # format 6
 912     {   my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
 913         $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4);                  # encode offset
 914         $opcode|=(1<<5)  if ($mod =~ /^,m/);
 915         $opcode|=(1<<13) if ($mod =~ /^,mb/);
 916         sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
 917     }
 918     else { "\t".$orig; }
 919 };
 920
 921 my $extrd = sub {
 922   my ($mod,$args) = @_;
 923   my $orig = "extrd$mod\t$args";
 924
 925     # I only have ",u" completer, it's implicitly encoded...
 926     if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)     # format 15
 927     {   my $opcode=(0x36<<26)|($1<<21)|($4<<16);
 928         my $len=32-$3;
 929         $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);               # encode pos
 930         $opcode |= (($len&0x20)<<7)|($len&0x1f);                # encode len
 931         sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
 932     }
 933     elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)      # format 12
 934     {   my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
 935         my $len=32-$2;
 936         $opcode |= (($len&0x20)<<3)|($len&0x1f);                # encode len
 937         $opcode |= (1<<13) if ($mod =~ /,\**=/);
 938         sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
 939     }
 940     else { "\t".$orig; }
 941 };
 942
 943 my $shrpd = sub {
 944   my ($mod,$args) = @_;
 945   my $orig = "shrpd$mod\t$args";
 946
 947     if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)   # format 14
 948     {   my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
 949         my $cpos=63-$3;
 950         $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);         # encode sa
 951         sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
 952     }
 953     else { "\t".$orig; }
 954 };
 955
 956 my $sub = sub {
 957   my ($mod,$args) = @_;
 958   my $orig = "sub$mod\t$args";
 959
 960     if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
 961         my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
 962         $opcode|=(1<<10);       # e1
 963         $opcode|=(1<<8);        # e2
 964         $opcode|=(1<<5);        # d
 965         sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
 966     }
 967     else { "\t".$orig; }
 968 };
 969
 970 sub assemble {
 971   my ($mnemonic,$mod,$args)=@_;
 972   my $opcode = eval("\$$mnemonic");
 973
 974     ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
 975 }
 976
 977 foreach (split("\n",$code)) {
 978         s/\`([^\`]*)\`/eval $1/ge;
 979         # flip word order in 64-bit mode...
 980         s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
 981         # assemble 2.0 instructions in 32-bit mode...
 982         s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
 983
 984         print $_,"\n";
 985 }
 986 close STDOUT;