crypto/bn/asm/parisc-mont.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2009-2018 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the OpenSSL license (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 # ====================================================================
  16
  17 # On PA-7100LC this module performs ~90-50% better, less for longer
  18 # keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
  19 # that compiler utilized xmpyu instruction to perform 32x32=64-bit
  20 # multiplication, which in turn means that "baseline" performance was
  21 # optimal in respect to instruction set capabilities. Fair comparison
  22 # with vendor compiler is problematic, because OpenSSL doesn't define
  23 # BN_LLONG [presumably] for historical reasons, which drives compiler
  24 # toward 4 times 16x16=32-bit multiplications [plus complementary
  25 # shifts and additions] instead. This means that you should observe
  26 # several times improvement over code generated by vendor compiler
  27 # for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
  28 # improvement coefficient was never collected on PA-7100LC, or any
  29 # other 1.1 CPU, because I don't have access to such a machine with
  30 # vendor compiler. But to give you a taste, PA-RISC 1.1 code path
  31 # reportedly outperformed code generated by cc +DA1.1 +O3 by factor
  32 # of ~5x on PA-8600.
  33 #
  34 # On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
  35 # reportedly ~2x faster than vendor compiler generated code [according
  36 # to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
  37 # this implementation is actually 32-bit one, in the sense that it
  38 # operates on 32-bit values. But pa-risc2[W].s operates on arrays of
  39 # 64-bit BN_LONGs... How do they interoperate then? No problem. This
  40 # module picks halves of 64-bit values in reverse order and pretends
  41 # they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
  42 # 64-bit code such as pa-risc2[W].s then? Well, the thing is that
  43 # 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
  44 # i.e. there is no "wider" multiplication like on most other 64-bit
  45 # platforms. This means that even being effectively 32-bit, this
  46 # implementation performs "64-bit" computational task in same amount
  47 # of arithmetic operations, most notably multiplications. It requires
  48 # more memory references, most notably to tp[num], but this doesn't
  49 # seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
  50 # 2.0 code path provides virtually same performance as pa-risc2[W].s:
  51 # it's ~10% better for shortest key length and ~10% worse for longest
  52 # one.
  53 #
  54 # In case it wasn't clear. The module has two distinct code paths:
  55 # PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
  56 # additions and 64-bit integer loads, not to mention specific
  57 # instruction scheduling. In 64-bit build naturally only 2.0 code path
  58 # is assembled. In 32-bit application context both code paths are
  59 # assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
  60 # is taken automatically. Also, in 32-bit build the module imposes a
  61 # couple of limitations: vector lengths have to be even and vector
  62 # addresses have to be 64-bit aligned. Normally neither is a problem:
  63 # most common key lengths are even and vectors are commonly malloc-ed,
  64 # which ensures alignment.
  65 #
  66 # Special thanks to polarhome.com for providing HP-UX account on
  67 # PA-RISC 1.1 machine, and to correspondent who chose to remain
  68 # anonymous for testing the code on PA-RISC 2.0 machine.
  69 \f
  70 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;      # directory part of the script path, trailing separator included
  71
  72 $flavour = shift;       # 1st argument: flavour; a value matching /64/ selects the 64-bit settings
  73 $output = shift;        # 2nd argument: file that STDOUT is redirected to
  74 # Redirect STDOUT with a checked three-argument open; without an output argument STDOUT is left untouched.
  75 if (defined $output) { open STDOUT,">",$output or die "can't open $output: $!"; }
  76 # ABI-dependent parameters: assembler level, pointer size, frame layout, push/pop mnemonics and BN_LONG size.
  77 if ($flavour =~ /64/) {
  78         $LEVEL          ="2.0W";
  79         $SIZE_T         =8;
  80         $FRAME_MARKER   =80;
  81         $SAVED_RP       =16;
  82         $PUSH           ="std";
  83         $PUSHMA         ="std,ma";
  84         $POP            ="ldd";
  85         $POPMB          ="ldd,mb";
  86         $BN_SZ          =$SIZE_T;
  87 } else {
  88         $LEVEL          ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
  89         $SIZE_T         =4;
  90         $FRAME_MARKER   =48;
  91         $SAVED_RP       =20;
  92         $PUSH           ="stw";
  93         $PUSHMA         ="stwm";
  94         $POP            ="ldw";
  95         $POPMB          ="ldwm";
  96         $BN_SZ          =$SIZE_T;
     # A 32-bit build may still use 64-bit BN_LONGs: if opensslconf.h (looked
     # up relative to the script directory) defines a SIXTY_FOUR_BIT* macro,
     # switch to 8-byte BN_SZ and assembler level 2.0.  A missing or
     # unreadable header is deliberately not an error; the defaults stay.
  97         if (open my $conf,"<","${dir}../../opensslconf.h") {
  98             while(<$conf>) {
  99                 if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
 100                     $BN_SZ=8;
 101                     $LEVEL="2.0";
 102                     last;
 103                 }
 104             }
 105             close $conf;
 106         }
 107 }
 108
 109 $LOCALS=8*$SIZE_T;                      # size of the area for 8 saved regs
 110 $FRAME=$LOCALS+$FRAME_MARKER+32;        # saved regs + frame marker
 111                                         # [+ argument transfer] + 32 for local variables
 113
 114 ($tp,$ti1,$ti0)=("%r31","%r29","%r28");
 117
 118 ($rp,$ap,$bp,$np)=("%r26","%r25","%r24","%r23");
 122 ($n0,$num)=("%r22","%r21");     # both are passed through stack in 32-bit
 124 ($idx,$arrsz)=("%r20","%r19");
 126
 127 ($nm1,$nm0)=("%r7","%r6");
 129 ($ab1,$ab0)=("%r5","%r4");
 131
 132 ($fp,$hi1,$hi0)=("%r3","%r2","%r1");
 135
 136 $xfer=$n0;      # same register as n0; accommodates [-16..15] offset in fld[dw]s
 137
 138 ($fm0,$fbi,$fn0)=("%fr4","%fr5L","%fr5R");
 139 $fti=$fm0;                      # fti shares the register with fm0
 141 ($fai,$fab0,$fab1)=("%fr6","%fr7","%fr8");
 142 ($fni,$fnm0,$fnm1)=("%fr9","%fr10","%fr11");
 143 # Entry point and prologue of bn_mul_mont: store %r2 and %r3-%r10 in the
     # frame, then set $fp to %sp-$FRAME.  Back-quoted expressions inside the
     # heredocs are presumably evaluated by a later pass of this script (not
     # visible in this chunk) -- TODO confirm.
 144 $code=<<___;
 145         .LEVEL  $LEVEL
 146         .SPACE  \$TEXT\$
 147         .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
 148
 149         .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
 150         .ALIGN  64
 151 bn_mul_mont
 152         .PROC
 153         .CALLINFO       FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
 154         .ENTRY
 155         $PUSH   %r2,-$SAVED_RP(%sp)             ; standard prologue
 156         $PUSHMA %r3,$FRAME(%sp)
 157         $PUSH   %r4,`-$FRAME+1*$SIZE_T`(%sp)
 158         $PUSH   %r5,`-$FRAME+2*$SIZE_T`(%sp)
 159         $PUSH   %r6,`-$FRAME+3*$SIZE_T`(%sp)
 160         $PUSH   %r7,`-$FRAME+4*$SIZE_T`(%sp)
 161         $PUSH   %r8,`-$FRAME+5*$SIZE_T`(%sp)
 162         $PUSH   %r9,`-$FRAME+6*$SIZE_T`(%sp)
 163         $PUSH   %r10,`-$FRAME+7*$SIZE_T`(%sp)
 164         ldo     -$FRAME(%sp),$fp
 165 ___
 166 $code.=<<___ if ($SIZE_T==4);               # 32-bit ABI: n0 and num are passed on the stack, load them
 167         ldw     `-$FRAME_MARKER-4`($fp),$n0
 168         ldw     `-$FRAME_MARKER-8`($fp),$num
 169         nop
 170         nop                                     ; alignment
 171 ___
     # 32-bit BN_LONG: check vector length (compared against 6), evenness of
     # num and 64-bit alignment of ap/np.  Each failed check branches to
     # L$abort (defined outside this chunk); the first one does so with 0 in
     # %r28 ("unhandled").  Then n0 and bp[0] are loaded, advancing $bp by 4.
 172 $code.=<<___ if ($BN_SZ==4);
 173         comiclr,<=      6,$num,%r0              ; are vectors long enough?
 174         b               L\$abort
 175         ldi             0,%r28                  ; signal "unhandled"
 176         add,ev          %r0,$num,$num           ; is $num even?
 177         b               L\$abort
 178         nop
 179         or              $ap,$np,$ti1
 180         extru,=         $ti1,31,3,%r0           ; are ap and np 64-bit aligned?
 181         b               L\$abort
 182         nop
 183         nop                                     ; alignment
 184         nop
 185
 186         fldws           0($n0),${fn0}
 187         fldws,ma        4($bp),${fbi}           ; bp[0]
 188 ___
     # 64-bit BN_LONG: check vector length (compared against 3), double num
     # so that it counts 32-bit words, and load the words at offset 4 of n0
     # and bp[0]; $bp itself is not advanced here.
 189 $code.=<<___ if ($BN_SZ==8);
 190         comib,>         3,$num,L\$abort         ; are vectors long enough?
 191         ldi             0,%r28                  ; signal "unhandled"
 192         addl            $num,$num,$num          ; I operate on 32-bit values
 193
 194         fldws           4($n0),${fn0}           ; only low part of n0
 195         fldws           4($bp),${fbi}           ; bp[0] in flipped word order
 196 ___
 197 $code.=<<___;                                   # common set-up: reserve tp[] on the stack, start the first products
 198         fldds           0($ap),${fai}           ; ap[0,1]
 199         fldds           0($np),${fni}           ; np[0,1]
 200
 201         sh2addl         $num,%r0,$arrsz
 202         ldi             31,$hi0
 203         ldo             36($arrsz),$hi1         ; space for tp[num+1]
 204         andcm           $hi1,$hi0,$hi1          ; align
 205         addl            $hi1,%sp,%sp
 206         $PUSH           $fp,-$SIZE_T(%sp)
 207
 208         ldo             `$LOCALS+16`($fp),$xfer
 209         ldo             `$LOCALS+32+4`($fp),$tp
 210
 211         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[0]*bp[0]
 212         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[1]*bp[0]
 213         xmpyu           ${fn0},${fab0}R,${fm0}
 214
 215         addl            $arrsz,$ap,$ap          ; point at the end
 216         addl            $arrsz,$np,$np
 217         subi            0,$arrsz,$idx           ; j=0
 218         ldo             8($idx),$idx            ; j++++
 219
 220         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[0]*m
 221         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[1]*m
 222         fstds           ${fab0},-16($xfer)
 223         fstds           ${fnm0},-8($xfer)
 224         fstds           ${fab1},0($xfer)
 225         fstds           ${fnm1},8($xfer)
 226          flddx          $idx($ap),${fai}        ; ap[2,3]
 227          flddx          $idx($np),${fni}        ; np[2,3]
 228 ___
     # 32-bit BN_LONG only: run-time dispatch between the two code paths.
     # The branch leads to L$parisc11, the PA-RISC 1.1 path; presumably the
     # preceding extrd,u,*= nullifies it on a PA-RISC 2.0 CPU -- TODO confirm.
 229 $code.=<<___ if ($BN_SZ==4);
 230         mtctl           $hi0,%cr11              ; $hi0 still holds 31
 231         extrd,u,*=      $hi0,%sar,1,$hi0        ; executes on PA-RISC 1.0
 232         b               L\$parisc11
 233         nop
 234 ___
 235 $code.=<<___;                                   # PA-RISC 2.0 code-path: first pass over bp[0], loop L$1st
 236         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
 237         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 238         ldd             -16($xfer),$ab0
 239         fstds           ${fab0},-16($xfer)
 240
 241         extrd,u         $ab0,31,32,$hi0
 242         extrd,u         $ab0,63,32,$ab0
 243         ldd             -8($xfer),$nm0
 244         fstds           ${fnm0},-8($xfer)
 245          ldo            8($idx),$idx            ; j++++
 246          addl           $ab0,$nm0,$nm0          ; low part is discarded
 247          extrd,u        $nm0,31,32,$hi1
 248 \f
 249 L\$1st
 250         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[0]
 251         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
 252         ldd             0($xfer),$ab1
 253         fstds           ${fab1},0($xfer)
 254          addl           $hi0,$ab1,$ab1
 255          extrd,u        $ab1,31,32,$hi0
 256         ldd             8($xfer),$nm1
 257         fstds           ${fnm1},8($xfer)
 258          extrd,u        $ab1,63,32,$ab1
 259          addl           $hi1,$nm1,$nm1
 260         flddx           $idx($ap),${fai}        ; ap[j,j+1]
 261         flddx           $idx($np),${fni}        ; np[j,j+1]
 262          addl           $ab1,$nm1,$nm1
 263          extrd,u        $nm1,31,32,$hi1
 264
 265         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
 266         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 267         ldd             -16($xfer),$ab0
 268         fstds           ${fab0},-16($xfer)
 269          addl           $hi0,$ab0,$ab0
 270          extrd,u        $ab0,31,32,$hi0
 271         ldd             -8($xfer),$nm0
 272         fstds           ${fnm0},-8($xfer)
 273          extrd,u        $ab0,63,32,$ab0
 274          addl           $hi1,$nm0,$nm0
 275         stw             $nm1,-4($tp)            ; tp[j-1]
 276          addl           $ab0,$nm0,$nm0
 277          stw,ma         $nm0,8($tp)             ; tp[j-1]
 278         addib,<>        8,$idx,L\$1st           ; j++++
 279          extrd,u        $nm0,31,32,$hi1
 280
 281         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j]*bp[0]
 282         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j]*m
 283         ldd             0($xfer),$ab1
 284         fstds           ${fab1},0($xfer)
 285          addl           $hi0,$ab1,$ab1
 286          extrd,u        $ab1,31,32,$hi0
 287         ldd             8($xfer),$nm1
 288         fstds           ${fnm1},8($xfer)
 289          extrd,u        $ab1,63,32,$ab1
 290          addl           $hi1,$nm1,$nm1
 291         ldd             -16($xfer),$ab0
 292          addl           $ab1,$nm1,$nm1
 293         ldd             -8($xfer),$nm0
 294          extrd,u        $nm1,31,32,$hi1
 295
 296          addl           $hi0,$ab0,$ab0
 297          extrd,u        $ab0,31,32,$hi0
 298         stw             $nm1,-4($tp)            ; tp[j-1]
 299          extrd,u        $ab0,63,32,$ab0
 300          addl           $hi1,$nm0,$nm0
 301         ldd             0($xfer),$ab1
 302          addl           $ab0,$nm0,$nm0
 303         ldd,mb          8($xfer),$nm1
 304          extrd,u        $nm0,31,32,$hi1
 305         stw,ma          $nm0,8($tp)             ; tp[j-1]
 306
 307         ldo             -1($num),$num           ; i--
 308         subi            0,$arrsz,$idx           ; j=0
 309 ___
 310 $code.=<<___ if ($BN_SZ==4);                    # 32-bit BN_LONG: fetch bp[1] and advance $bp by 4
 311         fldws,ma        4($bp),${fbi}           ; bp[1]
 312 ___
 313 $code.=<<___ if ($BN_SZ==8);                    # 64-bit BN_LONG: bp[1] is the word at offset 0; $bp is not advanced
 314         fldws           0($bp),${fbi}           ; bp[1] in flipped word order
 315 ___
 316 $code.=<<___;                                   # finish the first pass, then L$outer/L$inner: add ap[]*bp[i] and np[]*m into tp[]
 317          flddx          $idx($ap),${fai}        ; ap[0,1]
 318          flddx          $idx($np),${fni}        ; np[0,1]
 319          fldws          8($xfer),${fti}R        ; tp[0]
 320         addl            $hi0,$ab1,$ab1
 321          extrd,u        $ab1,31,32,$hi0
 322          extrd,u        $ab1,63,32,$ab1
 323          ldo            8($idx),$idx            ; j++++
 324          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[1]
 325          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[1]
 326         addl            $hi1,$nm1,$nm1
 327         addl            $ab1,$nm1,$nm1
 328         extrd,u         $nm1,31,32,$hi1
 329          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
 330         stw             $nm1,-4($tp)            ; tp[j-1]
 331
 332          fcpy,sgl       %fr0,${fti}L            ; zero high part
 333          fcpy,sgl       %fr0,${fab0}L
 334         addl            $hi1,$hi0,$hi0
 335         extrd,u         $hi0,31,32,$hi1
 336          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
 337          fcnvxf,dbl,dbl ${fab0},${fab0}
 338         stw             $hi0,0($tp)
 339         stw             $hi1,4($tp)
 340
 341         fadd,dbl        ${fti},${fab0},${fab0}  ; add tp[0]
 342         fcnvfx,dbl,dbl  ${fab0},${fab0}         ; double -> 33-bit unsigned int
 343         xmpyu           ${fn0},${fab0}R,${fm0}
 344         ldo             `$LOCALS+32+4`($fp),$tp
 345 L\$outer
 346         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[0]*m
 347         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[1]*m
 348         fstds           ${fab0},-16($xfer)      ; 33-bit value
 349         fstds           ${fnm0},-8($xfer)
 350          flddx          $idx($ap),${fai}        ; ap[2]
 351          flddx          $idx($np),${fni}        ; np[2]
 352          ldo            8($idx),$idx            ; j++++
 353         ldd             -16($xfer),$ab0         ; 33-bit value
 354         ldd             -8($xfer),$nm0
 355         ldw             0($xfer),$hi0           ; high part
 356
 357         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
 358         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 359          extrd,u        $ab0,31,32,$ti0         ; carry bit
 360          extrd,u        $ab0,63,32,$ab0
 361         fstds           ${fab1},0($xfer)
 362          addl           $ti0,$hi0,$hi0          ; account carry bit
 363         fstds           ${fnm1},8($xfer)
 364          addl           $ab0,$nm0,$nm0          ; low part is discarded
 365         ldw             0($tp),$ti1             ; tp[1]
 366          extrd,u        $nm0,31,32,$hi1
 367         fstds           ${fab0},-16($xfer)
 368         fstds           ${fnm0},-8($xfer)
 369 \f
 370 L\$inner
 371         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[i]
 372         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
 373         ldd             0($xfer),$ab1
 374         fstds           ${fab1},0($xfer)
 375          addl           $hi0,$ti1,$ti1
 376          addl           $ti1,$ab1,$ab1
 377         ldd             8($xfer),$nm1
 378         fstds           ${fnm1},8($xfer)
 379          extrd,u        $ab1,31,32,$hi0
 380          extrd,u        $ab1,63,32,$ab1
 381         flddx           $idx($ap),${fai}        ; ap[j,j+1]
 382         flddx           $idx($np),${fni}        ; np[j,j+1]
 383          addl           $hi1,$nm1,$nm1
 384          addl           $ab1,$nm1,$nm1
 385         ldw             4($tp),$ti0             ; tp[j]
 386         stw             $nm1,-4($tp)            ; tp[j-1]
 387
 388         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
 389         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 390         ldd             -16($xfer),$ab0
 391         fstds           ${fab0},-16($xfer)
 392          addl           $hi0,$ti0,$ti0
 393          addl           $ti0,$ab0,$ab0
 394         ldd             -8($xfer),$nm0
 395         fstds           ${fnm0},-8($xfer)
 396          extrd,u        $ab0,31,32,$hi0
 397          extrd,u        $nm1,31,32,$hi1
 398         ldw             8($tp),$ti1             ; tp[j]
 399          extrd,u        $ab0,63,32,$ab0
 400          addl           $hi1,$nm0,$nm0
 401          addl           $ab0,$nm0,$nm0
 402          stw,ma         $nm0,8($tp)             ; tp[j-1]
 403         addib,<>        8,$idx,L\$inner         ; j++++
 404          extrd,u        $nm0,31,32,$hi1
 405
 406         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j]*bp[i]
 407         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j]*m
 408         ldd             0($xfer),$ab1
 409         fstds           ${fab1},0($xfer)
 410          addl           $hi0,$ti1,$ti1
 411          addl           $ti1,$ab1,$ab1
 412         ldd             8($xfer),$nm1
 413         fstds           ${fnm1},8($xfer)
 414          extrd,u        $ab1,31,32,$hi0
 415          extrd,u        $ab1,63,32,$ab1
 416         ldw             4($tp),$ti0             ; tp[j]
 417          addl           $hi1,$nm1,$nm1
 418          addl           $ab1,$nm1,$nm1
 419         ldd             -16($xfer),$ab0
 420         ldd             -8($xfer),$nm0
 421          extrd,u        $nm1,31,32,$hi1
 422
 423         addl            $hi0,$ab0,$ab0
 424          addl           $ti0,$ab0,$ab0
 425          stw            $nm1,-4($tp)            ; tp[j-1]
 426          extrd,u        $ab0,31,32,$hi0
 427         ldw             8($tp),$ti1             ; tp[j]
 428          extrd,u        $ab0,63,32,$ab0
 429          addl           $hi1,$nm0,$nm0
 430         ldd             0($xfer),$ab1
 431          addl           $ab0,$nm0,$nm0
 432         ldd,mb          8($xfer),$nm1
 433          extrd,u        $nm0,31,32,$hi1
 434          stw,ma         $nm0,8($tp)             ; tp[j-1]
 435
 436         addib,=         -1,$num,L\$outerdone    ; i--
 437         subi            0,$arrsz,$idx           ; j=0
 438 ___
 439 $code.=<<___ if ($BN_SZ==4);                    # 32-bit BN_LONG: fetch the next bp[i] and advance $bp by 4
 440         fldws,ma        4($bp),${fbi}           ; bp[i]
 441 ___
     # 64-bit BN_LONG: $bp is stepped by 12 or by -4 before the load, so that
     # the 32-bit halves are read in flipped order; presumably addl,ev skips
     # the "ldi -4" when $num is even -- TODO confirm.
 442 $code.=<<___ if ($BN_SZ==8);
 443         ldi             12,$ti0                 ; bp[i] in flipped word order
 444         addl,ev         %r0,$num,$num
 445         ldi             -4,$ti0
 446         addl            $ti0,$bp,$bp
 447         fldws           0($bp),${fbi}
 448 ___
 449 $code.=<<___;                                   # tail of an L$outer iteration (branches back), then L$outerdone stores the top words of tp[]
 450          flddx          $idx($ap),${fai}        ; ap[0]
 451         addl            $hi0,$ab1,$ab1
 452          flddx          $idx($np),${fni}        ; np[0]
 453          fldws          8($xfer),${fti}R        ; tp[0]
 454         addl            $ti1,$ab1,$ab1
 455         extrd,u         $ab1,31,32,$hi0
 456         extrd,u         $ab1,63,32,$ab1
 457
 458          ldo            8($idx),$idx            ; j++++
 459          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[i]
 460          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[i]
 461         ldw             4($tp),$ti0             ; tp[j]
 462
 463         addl            $hi1,$nm1,$nm1
 464          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
 465         addl            $ab1,$nm1,$nm1
 466         extrd,u         $nm1,31,32,$hi1
 467          fcpy,sgl       %fr0,${fti}L            ; zero high part
 468          fcpy,sgl       %fr0,${fab0}L
 469         stw             $nm1,-4($tp)            ; tp[j-1]
 470
 471          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
 472          fcnvxf,dbl,dbl ${fab0},${fab0}
 473         addl            $hi1,$hi0,$hi0
 474          fadd,dbl       ${fti},${fab0},${fab0}  ; add tp[0]
 475         addl            $ti0,$hi0,$hi0
 476         extrd,u         $hi0,31,32,$hi1
 477          fcnvfx,dbl,dbl ${fab0},${fab0}         ; double -> 33-bit unsigned int
 478         stw             $hi0,0($tp)
 479         stw             $hi1,4($tp)
 480          xmpyu          ${fn0},${fab0}R,${fm0}
 481
 482         b               L\$outer
 483         ldo             `$LOCALS+32+4`($fp),$tp
 484 \f
 485 L\$outerdone
 486         addl            $hi0,$ab1,$ab1
 487         addl            $ti1,$ab1,$ab1
 488         extrd,u         $ab1,31,32,$hi0
 489         extrd,u         $ab1,63,32,$ab1
 490
 491         ldw             4($tp),$ti0             ; tp[j]
 492
 493         addl            $hi1,$nm1,$nm1
 494         addl            $ab1,$nm1,$nm1
 495         extrd,u         $nm1,31,32,$hi1
 496         stw             $nm1,-4($tp)            ; tp[j-1]
 497
 498         addl            $hi1,$hi0,$hi0
 499         addl            $ti0,$hi0,$hi0
 500         extrd,u         $hi0,31,32,$hi1
 501         stw             $hi0,0($tp)
 502         stw             $hi1,4($tp)
 503
 504         ldo             `$LOCALS+32`($fp),$tp
 505         sub             %r0,%r0,%r0             ; clear borrow
 506 ___
 507 $code.=<<___ if ($BN_SZ==4);                    # 32-bit BN_LONG: L$sub stores tp[]-np[] into rp[]; an rp that is not 64-bit aligned goes to L$sub_pa11
 508         ldws,ma         4($tp),$ti0
 509         extru,=         $rp,31,3,%r0            ; is rp 64-bit aligned?
 510         b               L\$sub_pa11
 511         addl            $tp,$arrsz,$tp
 512 L\$sub
 513         ldwx            $idx($np),$hi0
 514         subb            $ti0,$hi0,$hi1
 515         ldwx            $idx($tp),$ti0
 516         addib,<>        4,$idx,L\$sub
 517         stws,ma         $hi1,4($rp)
 518
 519         subb            $ti0,%r0,$hi1
 520 ___
     # 64-bit BN_LONG: the same subtraction on 64-bit words; each tp[] word
     # has its 32-bit halves swapped and is written back before use.
 521 $code.=<<___ if ($BN_SZ==8);
 522         ldd,ma          8($tp),$ti0
 523 L\$sub
 524         ldd             $idx($np),$hi0
 525         shrpd           $ti0,$ti0,32,$ti0       ; flip word order
 526         std             $ti0,-8($tp)            ; save flipped value
 527         sub,db          $ti0,$hi0,$hi1
 528         ldd,ma          8($tp),$ti0
 529         addib,<>        8,$idx,L\$sub
 530         std,ma          $hi1,8($rp)
 531
 532         extrd,u         $ti0,31,32,$ti0         ; carry in flipped word order
 533         sub,db          $ti0,%r0,$hi1
 534 ___
     # L$copy: walk tp[] and rp[] in 8-byte steps, zeroing tp[] on the way.
     # $hi1 (result of the last borrow-propagating subtract above) selects
     # what ends up in rp[]: the difference already stored there when $hi1
     # is zero, otherwise the value from tp[].
 535 $code.=<<___;
 536         ldo             `$LOCALS+32`($fp),$tp
 537         sub             $rp,$arrsz,$rp          ; rewind rp
 538         subi            0,$arrsz,$idx
 539 L\$copy
 540         ldd             0($tp),$ti0
 541         ldd             0($rp),$hi0
 542         std,ma          %r0,8($tp)
 543         comiclr,=       0,$hi1,%r0
 544         copy            $ti0,$hi0
 545         addib,<>        8,$idx,L\$copy
 546         std,ma          $hi0,8($rp)
 547 ___
 548
 549 if ($BN_SZ==4) {                                # PA-RISC 1.1 code-path
 550 $ablo=$ab0;
 551 $abhi=$ab1;
 552 $nmlo0=$nm0;
 553 $nmhi0=$nm1;
 554 $nmlo1="%r9";
 555 $nmhi1="%r8";
 556
 557 $code.=<<___;
 558         b               L\$done
 559         nop
 560
 561         .ALIGN          8
 562 L\$parisc11
 563         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
 564         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 565         ldw             -12($xfer),$ablo
 566         ldw             -16($xfer),$hi0
 567         ldw             -4($xfer),$nmlo0
 568         ldw             -8($xfer),$nmhi0
 569         fstds           ${fab0},-16($xfer)
 570         fstds           ${fnm0},-8($xfer)
 571
 572          ldo            8($idx),$idx            ; j++++
 573          add            $ablo,$nmlo0,$nmlo0     ; discarded
 574          addc           %r0,$nmhi0,$hi1
 575         ldw             4($xfer),$ablo
 576         ldw             0($xfer),$abhi
 577         nop
 578 \f
 579 L\$1st_pa11
 580         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[0]
 581         flddx           $idx($ap),${fai}        ; ap[j,j+1]
 582         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
 583         flddx           $idx($np),${fni}        ; np[j,j+1]
 584          add            $hi0,$ablo,$ablo
 585         ldw             12($xfer),$nmlo1
 586          addc           %r0,$abhi,$hi0
 587         ldw             8($xfer),$nmhi1
 588          add            $ablo,$nmlo1,$nmlo1
 589         fstds           ${fab1},0($xfer)
 590          addc           %r0,$nmhi1,$nmhi1
 591         fstds           ${fnm1},8($xfer)
 592          add            $hi1,$nmlo1,$nmlo1
 593         ldw             -12($xfer),$ablo
 594          addc           %r0,$nmhi1,$hi1
 595         ldw             -16($xfer),$abhi
 596
 597         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[0]
 598         ldw             -4($xfer),$nmlo0
 599         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 600         ldw             -8($xfer),$nmhi0
 601          add            $hi0,$ablo,$ablo
 602         stw             $nmlo1,-4($tp)          ; tp[j-1]
 603          addc           %r0,$abhi,$hi0
 604         fstds           ${fab0},-16($xfer)
 605          add            $ablo,$nmlo0,$nmlo0
 606         fstds           ${fnm0},-8($xfer)
 607          addc           %r0,$nmhi0,$nmhi0
 608         ldw             0($xfer),$abhi
 609          add            $hi1,$nmlo0,$nmlo0
 610         ldw             4($xfer),$ablo
 611          stws,ma        $nmlo0,8($tp)           ; tp[j-1]
 612         addib,<>        8,$idx,L\$1st_pa11      ; j++++
 613          addc           %r0,$nmhi0,$hi1
 614
 615          ldw            8($xfer),$nmhi1
 616          ldw            12($xfer),$nmlo1
 617         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j]*bp[0]
 618         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j]*m
 619          add            $hi0,$ablo,$ablo
 620         fstds           ${fab1},0($xfer)
 621          addc           %r0,$abhi,$hi0
 622         fstds           ${fnm1},8($xfer)
 623          add            $ablo,$nmlo1,$nmlo1
 624         ldw             -16($xfer),$abhi
 625          addc           %r0,$nmhi1,$nmhi1
 626         ldw             -12($xfer),$ablo
 627          add            $hi1,$nmlo1,$nmlo1
 628         ldw             -8($xfer),$nmhi0
 629          addc           %r0,$nmhi1,$hi1
 630         ldw             -4($xfer),$nmlo0
 631
 632          add            $hi0,$ablo,$ablo
 633         stw             $nmlo1,-4($tp)          ; tp[j-1]
 634          addc           %r0,$abhi,$hi0
 635         ldw             0($xfer),$abhi
 636          add            $ablo,$nmlo0,$nmlo0
 637         ldw             4($xfer),$ablo
 638          addc           %r0,$nmhi0,$nmhi0
 639         ldws,mb         8($xfer),$nmhi1
 640          add            $hi1,$nmlo0,$nmlo0
 641         ldw             4($xfer),$nmlo1
 642          addc           %r0,$nmhi0,$hi1
 643         stws,ma         $nmlo0,8($tp)           ; tp[j-1]
 644
 645         ldo             -1($num),$num           ; i--
 646         subi            0,$arrsz,$idx           ; j=0
 647
 648          fldws,ma       4($bp),${fbi}           ; bp[1]
 649          flddx          $idx($ap),${fai}        ; ap[0,1]
 650          flddx          $idx($np),${fni}        ; np[0,1]
 651          fldws          8($xfer),${fti}R        ; tp[0]
 652         add             $hi0,$ablo,$ablo
 653         addc            %r0,$abhi,$hi0
 654          ldo            8($idx),$idx            ; j++++
 655          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[1]
 656          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[1]
 657         add             $hi1,$nmlo1,$nmlo1
 658         addc            %r0,$nmhi1,$nmhi1
 659         add             $ablo,$nmlo1,$nmlo1
 660         addc            %r0,$nmhi1,$hi1
 661          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
 662         stw             $nmlo1,-4($tp)          ; tp[j-1]
 663
 664          fcpy,sgl       %fr0,${fti}L            ; zero high part
 665          fcpy,sgl       %fr0,${fab0}L
 666         add             $hi1,$hi0,$hi0
 667         addc            %r0,%r0,$hi1
 668          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
 669          fcnvxf,dbl,dbl ${fab0},${fab0}
 670         stw             $hi0,0($tp)
 671         stw             $hi1,4($tp)
 672
 673         fadd,dbl        ${fti},${fab0},${fab0}  ; add tp[0]
 674         fcnvfx,dbl,dbl  ${fab0},${fab0}         ; double -> 33-bit unsigned int
 675         xmpyu           ${fn0},${fab0}R,${fm0}
 676         ldo             `$LOCALS+32+4`($fp),$tp
 677 L\$outer_pa11
 678         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[0]*m
 679         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[1]*m
 680         fstds           ${fab0},-16($xfer)      ; 33-bit value
 681         fstds           ${fnm0},-8($xfer)
 682          flddx          $idx($ap),${fai}        ; ap[2,3]
 683          flddx          $idx($np),${fni}        ; np[2,3]
 684         ldw             -16($xfer),$abhi        ; carry bit actually
 685          ldo            8($idx),$idx            ; j++++
 686         ldw             -12($xfer),$ablo
 687         ldw             -8($xfer),$nmhi0
 688         ldw             -4($xfer),$nmlo0
 689         ldw             0($xfer),$hi0           ; high part
 690
 691         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
 692         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 693         fstds           ${fab1},0($xfer)
 694          addl           $abhi,$hi0,$hi0         ; account carry bit
 695         fstds           ${fnm1},8($xfer)
 696          add            $ablo,$nmlo0,$nmlo0     ; discarded
 697         ldw             0($tp),$ti1             ; tp[1]
 698          addc           %r0,$nmhi0,$hi1
 699         fstds           ${fab0},-16($xfer)
 700         fstds           ${fnm0},-8($xfer)
 701         ldw             4($xfer),$ablo
 702         ldw             0($xfer),$abhi
 703 \f
 704 L\$inner_pa11
 705         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j+1]*bp[i]
 706         flddx           $idx($ap),${fai}        ; ap[j,j+1]
 707         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
 708         flddx           $idx($np),${fni}        ; np[j,j+1]
 709          add            $hi0,$ablo,$ablo
 710         ldw             4($tp),$ti0             ; tp[j]
 711          addc           %r0,$abhi,$abhi
 712         ldw             12($xfer),$nmlo1
 713          add            $ti1,$ablo,$ablo
 714         ldw             8($xfer),$nmhi1
 715          addc           %r0,$abhi,$hi0
 716         fstds           ${fab1},0($xfer)
 717          add            $ablo,$nmlo1,$nmlo1
 718         fstds           ${fnm1},8($xfer)
 719          addc           %r0,$nmhi1,$nmhi1
 720         ldw             -12($xfer),$ablo
 721          add            $hi1,$nmlo1,$nmlo1
 722         ldw             -16($xfer),$abhi
 723          addc           %r0,$nmhi1,$hi1
 724
 725         xmpyu           ${fai}L,${fbi},${fab0}  ; ap[j]*bp[i]
 726         ldw             8($tp),$ti1             ; tp[j]
 727         xmpyu           ${fni}L,${fm0}R,${fnm0} ; np[j]*m
 728         ldw             -4($xfer),$nmlo0
 729          add            $hi0,$ablo,$ablo
 730         ldw             -8($xfer),$nmhi0
 731          addc           %r0,$abhi,$abhi
 732         stw             $nmlo1,-4($tp)          ; tp[j-1]
 733          add            $ti0,$ablo,$ablo
 734         fstds           ${fab0},-16($xfer)
 735          addc           %r0,$abhi,$hi0
 736         fstds           ${fnm0},-8($xfer)
 737          add            $ablo,$nmlo0,$nmlo0
 738         ldw             4($xfer),$ablo
 739          addc           %r0,$nmhi0,$nmhi0
 740         ldw             0($xfer),$abhi
 741          add            $hi1,$nmlo0,$nmlo0
 742          stws,ma        $nmlo0,8($tp)           ; tp[j-1]
 743         addib,<>        8,$idx,L\$inner_pa11    ; j++++
 744          addc           %r0,$nmhi0,$hi1
 745
 746         xmpyu           ${fai}R,${fbi},${fab1}  ; ap[j]*bp[i]
 747         ldw             12($xfer),$nmlo1
 748         xmpyu           ${fni}R,${fm0}R,${fnm1} ; np[j]*m
 749         ldw             8($xfer),$nmhi1
 750          add            $hi0,$ablo,$ablo
 751         ldw             4($tp),$ti0             ; tp[j]
 752          addc           %r0,$abhi,$abhi
 753         fstds           ${fab1},0($xfer)
 754          add            $ti1,$ablo,$ablo
 755         fstds           ${fnm1},8($xfer)
 756          addc           %r0,$abhi,$hi0
 757         ldw             -16($xfer),$abhi
 758          add            $ablo,$nmlo1,$nmlo1
 759         ldw             -12($xfer),$ablo
 760          addc           %r0,$nmhi1,$nmhi1
 761         ldw             -8($xfer),$nmhi0
 762          add            $hi1,$nmlo1,$nmlo1
 763         ldw             -4($xfer),$nmlo0
 764          addc           %r0,$nmhi1,$hi1
 765
 766         add             $hi0,$ablo,$ablo
 767          stw            $nmlo1,-4($tp)          ; tp[j-1]
 768         addc            %r0,$abhi,$abhi
 769          add            $ti0,$ablo,$ablo
 770         ldw             8($tp),$ti1             ; tp[j]
 771          addc           %r0,$abhi,$hi0
 772         ldw             0($xfer),$abhi
 773          add            $ablo,$nmlo0,$nmlo0
 774         ldw             4($xfer),$ablo
 775          addc           %r0,$nmhi0,$nmhi0
 776         ldws,mb         8($xfer),$nmhi1
 777          add            $hi1,$nmlo0,$nmlo0
 778         ldw             4($xfer),$nmlo1
 779          addc           %r0,$nmhi0,$hi1
 780          stws,ma        $nmlo0,8($tp)           ; tp[j-1]
 781
 782         addib,=         -1,$num,L\$outerdone_pa11; i--
 783         subi            0,$arrsz,$idx           ; j=0
 784
 785          fldws,ma       4($bp),${fbi}           ; bp[i]
 786          flddx          $idx($ap),${fai}        ; ap[0]
 787         add             $hi0,$ablo,$ablo
 788         addc            %r0,$abhi,$abhi
 789          flddx          $idx($np),${fni}        ; np[0]
 790          fldws          8($xfer),${fti}R        ; tp[0]
 791         add             $ti1,$ablo,$ablo
 792         addc            %r0,$abhi,$hi0
 793
 794          ldo            8($idx),$idx            ; j++++
 795          xmpyu          ${fai}L,${fbi},${fab0}  ; ap[0]*bp[i]
 796          xmpyu          ${fai}R,${fbi},${fab1}  ; ap[1]*bp[i]
 797         ldw             4($tp),$ti0             ; tp[j]
 798
 799         add             $hi1,$nmlo1,$nmlo1
 800         addc            %r0,$nmhi1,$nmhi1
 801          fstws,mb       ${fab0}L,-8($xfer)      ; save high part
 802         add             $ablo,$nmlo1,$nmlo1
 803         addc            %r0,$nmhi1,$hi1
 804          fcpy,sgl       %fr0,${fti}L            ; zero high part
 805          fcpy,sgl       %fr0,${fab0}L
 806         stw             $nmlo1,-4($tp)          ; tp[j-1]
 807
 808          fcnvxf,dbl,dbl ${fti},${fti}           ; 32-bit unsigned int -> double
 809          fcnvxf,dbl,dbl ${fab0},${fab0}
 810         add             $hi1,$hi0,$hi0
 811         addc            %r0,%r0,$hi1
 812          fadd,dbl       ${fti},${fab0},${fab0}  ; add tp[0]
 813         add             $ti0,$hi0,$hi0
 814         addc            %r0,$hi1,$hi1
 815          fcnvfx,dbl,dbl ${fab0},${fab0}         ; double -> 33-bit unsigned int
 816         stw             $hi0,0($tp)
 817         stw             $hi1,4($tp)
 818          xmpyu          ${fn0},${fab0}R,${fm0}
 819
 820         b               L\$outer_pa11
 821         ldo             `$LOCALS+32+4`($fp),$tp
 822 \f
 823 L\$outerdone_pa11
 824         add             $hi0,$ablo,$ablo
 825         addc            %r0,$abhi,$abhi
 826         add             $ti1,$ablo,$ablo
 827         addc            %r0,$abhi,$hi0
 828
 829         ldw             4($tp),$ti0             ; tp[j]
 830
 831         add             $hi1,$nmlo1,$nmlo1
 832         addc            %r0,$nmhi1,$nmhi1
 833         add             $ablo,$nmlo1,$nmlo1
 834         addc            %r0,$nmhi1,$hi1
 835         stw             $nmlo1,-4($tp)          ; tp[j-1]
 836
 837         add             $hi1,$hi0,$hi0
 838         addc            %r0,%r0,$hi1
 839         add             $ti0,$hi0,$hi0
 840         addc            %r0,$hi1,$hi1
 841         stw             $hi0,0($tp)
 842         stw             $hi1,4($tp)
 843
 844         ldo             `$LOCALS+32+4`($fp),$tp
 845         sub             %r0,%r0,%r0             ; clear borrow
 846         ldw             -4($tp),$ti0
 847         addl            $tp,$arrsz,$tp
 848 L\$sub_pa11
 849         ldwx            $idx($np),$hi0
 850         subb            $ti0,$hi0,$hi1
 851         ldwx            $idx($tp),$ti0
 852         addib,<>        4,$idx,L\$sub_pa11
 853         stws,ma         $hi1,4($rp)
 854
 855         subb            $ti0,%r0,$hi1
 856
 857         ldo             `$LOCALS+32`($fp),$tp
 858         sub             $rp,$arrsz,$rp          ; rewind rp
 859         subi            0,$arrsz,$idx
 860 L\$copy_pa11
 861         ldw             0($tp),$ti0
 862         ldw             0($rp),$hi0
 863         stws,ma         %r0,4($tp)
 864         comiclr,=       0,$hi1,%r0
 865         copy            $ti0,$hi0
 866         addib,<>        4,$idx,L\$copy_pa11
 867         stws,ma         $hi0,4($rp)
 868
 869         nop                                     ; alignment
 870 L\$done
 871 ___
 872 }
 873 \f
 # Append the routine's tail to $code: load 1 into %r28 (the "handled"
 # result), set %sp from $fp+$FRAME, reload %r2 and %r4-%r10 with $POP,
 # then return through %r2.  The L$abort label sits after those reloads,
 # so a branch to it reaches only the `bv (%r2)` and the $POPMB of %r3.
 # NOTE(review): the $POPMB after `bv` presumably runs in the branch delay
 # slot -- confirm against the PA-RISC manual.
 # The heredoc interpolates, hence the escaped `\$` in the label and `\@`
 # in the e-mail address.  The backquoted expressions are left as text
 # here; the output loop at the end of the file evaluates them, and also
 # rewrites `bv` to `bve` when $SIZE_T is 8.
 874 $code.=<<___;
 875         ldi             1,%r28                  ; signal "handled"
 876         ldo             $FRAME($fp),%sp         ; destroy tp[num+1]
 877
 878         $POP    `-$FRAME-$SAVED_RP`(%sp),%r2    ; standard epilogue
 879         $POP    `-$FRAME+1*$SIZE_T`(%sp),%r4
 880         $POP    `-$FRAME+2*$SIZE_T`(%sp),%r5
 881         $POP    `-$FRAME+3*$SIZE_T`(%sp),%r6
 882         $POP    `-$FRAME+4*$SIZE_T`(%sp),%r7
 883         $POP    `-$FRAME+5*$SIZE_T`(%sp),%r8
 884         $POP    `-$FRAME+6*$SIZE_T`(%sp),%r9
 885         $POP    `-$FRAME+7*$SIZE_T`(%sp),%r10
 886 L\$abort
 887         bv      (%r2)
 888         .EXIT
 889         $POPMB  -$FRAME(%sp),%r3
 890         .PROCEND
 891         .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
 892 ___
 893 \f
 894 # Explicitly encode PA-RISC 2.0 instructions used in this module, so
 895 # that it can be compiled with .LEVEL 1.0. It should be noted that I
 896 # wouldn't have to do this if the GNU assembler understood the
 897 # .ALLOW 2.0 directive...
 898
 # Hand-assembler for the PA-RISC 2.0 "ldd" instruction.
 #
 #   $mod  - completer text that followed the mnemonic ("" or e.g. ",mb")
 #   $args - operand text, e.g. "%r5(%r6),%r7" or "-8(%r3),%r4"
 #
 # Returns a ".WORD" line holding the encoded instruction, with the
 # original text kept as a trailing comment.  Operands matching neither
 # recognised shape are passed through as an ordinary "ldd" line.
 my $ldd = sub {
     my ($mod, $args) = @_;
     my $orig = "ldd$mod\t$args";

     # format 4: register index, %rX(%rB),%rT
     if (my ($index, $base, $target) =
             $args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
         my $opcode = (0x03 << 26) | ($base << 21) | ($index << 16)
                    | (3 << 6) | $target;
         return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
     }

     # format 5: short displacement, disp(%rB),%rT
     if (my ($disp, $base, $target) =
             $args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
         my $opcode = (0x03 << 26) | ($base << 21) | (1 << 12)
                    | (3 << 6) | $target;
         # encode offset: low four bits go to bits 17..20, bit 4 to bit 16
         $opcode |= (($disp & 0xF) << 17) | (($disp & 0x10) << 12);
         # any ",m*" completer sets bit 5; ",mb" additionally sets bit 13
         $opcode |= (1 << 5)  if $mod =~ /^,m/;
         $opcode |= (1 << 13) if $mod =~ /^,mb/;
         return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
     }

     return "\t" . $orig;
 };
 916
 # Hand-assembler for the PA-RISC 2.0 "std" instruction.
 #
 #   $mod  - completer text that followed the mnemonic ("" or e.g. ",ma")
 #   $args - operand text of the form "%rS,disp(%rB)"
 #
 # Returns a ".WORD" line holding the encoded instruction, with the
 # original text kept as a trailing comment.  Anything else is passed
 # through as an ordinary "std" line.
 my $std = sub {
     my ($mod, $args) = @_;
     my $orig = "std$mod\t$args";

     # format 6: %rS,disp(%rB)
     my @field = $args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/;
     return "\t" . $orig unless @field;

     my ($source, $disp, $base) = @field;
     my $opcode = (0x03 << 26) | ($base << 21) | ($source << 16)
                | (1 << 12) | (0xB << 6);
     # encode offset: low four bits go to bits 1..4, bit 4 to bit 0
     $opcode |= (($disp & 0xF) << 1) | (($disp & 0x10) >> 4);
     # any ",m*" completer sets bit 5; ",mb" additionally sets bit 13
     $opcode |= (1 << 5)  if $mod =~ /^,m/;
     $opcode |= (1 << 13) if $mod =~ /^,mb/;

     return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
 };
 930
 # Hand-assembler for the PA-RISC 2.0 "extrd" instruction.
 #
 #   $mod  - completer text that followed the mnemonic (e.g. ",u")
 #   $args - operand text, either "%rS,pos,len,%rT" or "%rS,%sar,len,%rT"
 #
 # Returns a ".WORD" line holding the encoded instruction, with the
 # original text kept as a trailing comment.  Anything else is passed
 # through as an ordinary "extrd" line.
 my $extrd = sub {
     my ($mod, $args) = @_;
     my $orig = "extrd$mod\t$args";

     # I only have ",u" completer, it's implicitly encoded...

     # format 15: fixed position, %rS,pos,len,%rT
     if (my ($source, $pos, $width, $target) =
             $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {
         my $len    = 32 - $width;
         my $opcode = (0x36 << 26) | ($source << 21) | ($target << 16);
         $opcode |= (($pos & 0x20) << 6) | (($pos & 0x1f) << 5);   # encode pos
         $opcode |= (($len & 0x20) << 7) | ($len & 0x1f);          # encode len
         return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
     }

     # format 12: position taken from %sar, %rS,%sar,len,%rT
     if (my ($source, $width, $target) =
             $args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {
         my $len    = 32 - $width;
         my $opcode = (0x34 << 26) | ($source << 21) | ($target << 16)
                    | (2 << 11) | (1 << 9);
         $opcode |= (($len & 0x20) << 3) | ($len & 0x1f);          # encode len
         # a ",=" or ",*=" condition in the completer sets bit 13
         $opcode |= (1 << 13) if $mod =~ /,\**=/;
         return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
     }

     return "\t" . $orig;
 };
 952
 # Hand-assembler for the PA-RISC 2.0 "shrpd" instruction.
 #
 #   $mod  - completer text that followed the mnemonic (kept in the comment only)
 #   $args - operand text of the form "%rA,%rB,sa,%rT"
 #
 # Returns a ".WORD" line holding the encoded instruction, with the
 # original text kept as a trailing comment.  Anything else (for example
 # a %sar shift amount) is passed through as an ordinary "shrpd" line.
 my $shrpd = sub {
     my ($mod, $args) = @_;
     my $orig = "shrpd$mod\t$args";

     # format 14: %rA,%rB,sa,%rT
     my @field = $args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/;
     return "\t" . $orig unless @field;

     my ($first, $second, $shift, $target) = @field;
     my $cpos   = 63 - $shift;
     my $opcode = (0x34 << 26) | ($second << 21) | ($first << 16)
                | (1 << 10) | $target;
     $opcode |= (($cpos & 0x20) << 6) | (($cpos & 0x1f) << 5);     # encode sa

     return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
 };
 965
 # Hand-assembler for the PA-RISC 2.0 "sub,db" instruction.
 #
 #   $mod  - completer text that followed the mnemonic; only ",db" is encoded
 #   $args - operand text of the form "%rA,%rB,%rT"
 #
 # Returns a ".WORD" line holding the encoded instruction, with the
 # original text kept as a trailing comment.  Every other "sub" variant
 # is passed through untouched for the assembler to handle.
 my $sub = sub {
     my ($mod, $args) = @_;
     my $orig = "sub$mod\t$args";

     return "\t" . $orig unless $mod eq ",db";

     my @reg = $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/;
     return "\t" . $orig unless @reg;

     my ($first, $second, $target) = @reg;
     my $opcode = (0x02 << 26) | ($second << 21) | ($first << 16) | $target
                | (1 << 10)     # e1
                | (1 << 8)      # e2
                | (1 << 5);     # d

     return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
 };
 979
 # Route one instruction to its hand-assembler, if the file defines one.
 #
 #   $mnemonic - instruction name (lower-case letters)
 #   $mod      - completer text glued to the mnemonic, possibly empty
 #   $args     - operand text
 #
 # The encoder is found by evaluating the variable named after the
 # mnemonic ($ldd, $std, ...).  When that yields a code reference, its
 # result is returned; otherwise the instruction is re-emitted as text.
 sub assemble {
     my ($mnemonic, $mod, $args) = @_;
     my $encoder = eval("\$$mnemonic");

     if (ref($encoder) eq 'CODE') {
         return $encoder->($mod, $args);
     }

     return "\t$mnemonic$mod\t$args";
 }
 986
 # Post-process the accumulated assembly text one line at a time and
 # write the result to STDOUT.
 foreach (split("\n",$code)) {
         # replace each `...` span with the value of the Perl expression in it
         s/\`([^\`]*)\`/eval $1/ge;
         # flip word order in 64-bit mode...
         s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
         # assemble 2.0 instructions in 32-bit mode...
         s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);

         # when $SIZE_T is 8 every `bv` mnemonic is rewritten to `bve`
         s/\bbv\b/bve/gm if ($SIZE_T==8);

         print $_,"\n";
 }
 # Output is buffered, so a write failure (full disk, closed pipe) may only
 # surface when the handle is closed.  Fail loudly instead of leaving a
 # silently truncated assembly file behind with a zero exit status.
 close STDOUT or die "error closing STDOUT: $!";