2e12eeb578e8232fa477cb891404ad21508890b4
[openssl.git] / crypto / bn / asm / sparcv9-mont.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. Rights for redistribution and usage in source and binary
6 # forms are granted according to the OpenSSL license.
7 # ====================================================================
8
9 # December 2005
10 #
11 # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
12 # for undertaking the effort are multiple. First of all, UltraSPARC is not
13 # the whole SPARCv9 universe and other VIS-free implementations deserve
14 # optimized code as much. Secondly, newly introduced UltraSPARC T1,
15 # a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
16 # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
17 # several integrated RSA/DSA accelerator circuits accessible through
18 # kernel driver [only(*)], but having decent user-land software
19 # implementation is important too. Finally, there was a desire to
20 # experiment with a dedicated squaring procedure. Yes, this module
21 # implements one, because it was easiest to draft it in SPARCv9
22 # instructions...
23
24 # (*)   Engine accessing the driver in question is on my TODO list.
25 #       For reference, the accelerator is estimated to give 6 to 10 times
26 #       improvement on single-threaded RSA sign. It should be noted
27 #       that 6-10x improvement coefficient does not actually mean
28 #       something extraordinary in terms of absolute [single-threaded]
29 #       performance, as the SPARCv9 instruction set is by all means the
30 #       least suitable for high performance crypto among other 64 bit
31 #       platforms. 6-10x factor simply places T1 in same performance
32 #       domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
33 #       appear impressive at all, but it's the sign operation which is
34 #       far more critical/interesting.
35
36 # You might notice that inner loops are modulo-scheduled:-) This has
37 # essentially negligible impact on UltraSPARC performance, it's
38 # Fujitsu SPARC64 V users who should notice and hopefully appreciate
39 # the advantage... Currently this module surpasses sparcv9a-mont.pl
40 # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
41 # module still has hidden potential [see TODO list there], which is
42 # estimated to be larger than 20%...
43
# Argument registers as seen after the `save` in the generated entry code
# (the pre-`save` size check in the template reads num through %o5 instead).
# These names are interpolated into the assembly templates below.
44 # int bn_mul_mont(
45 $rp="%i0";      # BN_ULONG *rp,          result vector, written in .Ltail
46 $ap="%i1";      # const BN_ULONG *ap,    first multiplicand
47 $bp="%i2";      # const BN_ULONG *bp,    second multiplicand; ap==bp selects the squaring path
48 $np="%i3";      # const BN_ULONG *np,    vector multiplied by $mul1 and subtracted in .Ltail
49 $n0="%i4";      # const BN_ULONG *n0,    pointer; one 32-bit word is loaded from it into the same register
50 $num="%i5";     # int num);              word count on entry; the template scales it to a byte count (num*4)
51
# Build flavour: 32-bit unless a 64-bit compiler flag appears on the command
# line. $bias is added to %sp wherever the template needs the real stack
# address, and $frame is the size of the register-window save area that
# `save` reserves and that precedes the temporary vector.
52 $bits=32;
53 for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
54 if ($bits==64)  { $bias=2047; $frame=192; }
55 else            { $bias=0;    $frame=128; }
56
# Scratch register aliases used by both assembly templates.
57 $car0="%o0";    # running sum/carry of the ap[j]*$mul0 chain
58 $car1="%o1";    # running sum/carry of the np[j]*$mul1 chain; its low word is what gets stored to tp
59 $car2="%o2";    # 1 bit; top-most carry kept between passes
60 $acc0="%o3";    # low 32 bits / pending product of the $car0 chain
61 $acc1="%o4";    # pending product of the $car1 chain
62 $mask="%g1";    # 32 bits, what a waste... (constant 0xffffffff)
63 $tmp0="%g4";    # product computed one iteration ahead (ap side)
64 $tmp1="%g5";    # product computed one iteration ahead (np side), also tp[0] at pass start
65
66 $i="%l0";       # outer loop index, kept as a byte offset
67 $j="%l1";       # inner loop index, kept as a byte offset
68 $mul0="%l2";    # current multiplier word (bp[i], or ap[i] when squaring)
69 $mul1="%l3";    # ("t[0]"*n0) masked to 32 bits, multiplier for the np words
70 $tp="%l4";      # moving pointer into the temporary vector on the stack
71 $apj="%l5";     # most recently loaded ap[j]
72 $npj="%l6";     # most recently loaded np[j]
73 $tpj="%l7";     # most recently loaded tp[j]
74
75 $fname="bn_mul_mont_int";       # global symbol emitted by the template
76
# First half of the assembly template: the entry point and the general
# multiplication path of $fname. The template text itself is emitted
# verbatim (after interpolation), so it is documented here rather than inline.
#
#   entry    - returns 0 right away (retl / clr %o0) when num < 4. Otherwise
#              it opens a register window, scales num to a byte count,
#              completes the 0xffffffff $mask, replaces the n0 pointer by the
#              32-bit word it points to, preloads bp[0] into $mul0 and
#              branches to .Lbn_sqr_mont when ap and bp are the same pointer.
#   alloca   - %sp is lowered by num bytes and rounded down to a 1024-byte
#              boundary; the temporary vector tp starts at %sp+$bias+$frame.
#   .L1st    - first pass (bp[0]); stores the initial contents of tp.
#   .Louter / .Linner
#            - one pass per remaining word of bp, accumulating into tp;
#              $car2 carries the top bit from one pass to the next.
#   .Ltail   - also the exit of the squaring path. np and rp are advanced by
#              num bytes and everything is indexed with a negative offset in
#              %o7. If $car2 is zero and the top tp word is below the top np
#              word, tp is copied to rp (.Lcopy). Otherwise np is subtracted
#              from tp into rp (.Lsub), and tp is copied over rp only if that
#              subtraction, including $car2, borrows. In every case tp is
#              then zeroed (.Lzap) and the function returns 1.
#
# The inner loops are modulo-scheduled: lines tagged !prologue! / !epilogue!
# prime and drain them, and most branches carry useful work in their delay
# slot, so instruction order in the template is significant.
# The backtick expression on the be,pt line is expanded by the eval
# substitution applied to $code at the end of this script.
77 $code=<<___;
78 .section        ".text",#alloc,#execinstr
79
80 .global $fname
81 .align  32
82 $fname:
83         cmp     %o5,4                   ! 128 bits minimum
84         bge,pt  %icc,.Lenter
85         sethi   %hi(0xffffffff),$mask
86         retl
87         clr     %o0
88 .align  32
89 .Lenter:
90         save    %sp,-$frame,%sp
91         sll     $num,2,$num             ! num*=4
92         or      $mask,%lo(0xffffffff),$mask
93         ld      [$n0],$n0
94         cmp     $ap,$bp
95         and     $num,$mask,$num
96         ld      [$bp],$mul0             ! bp[0]
97         be,pt   `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
98         nop
99
100         add     %sp,$bias,%o7           ! real top of stack
101         ld      [$ap],$car0             ! ap[0]
102         sub     %o7,$num,%o7
103         ld      [$ap+4],$apj            ! ap[1]
104         and     %o7,-1024,%o7
105         ld      [$np],$car1             ! np[0]
106         sub     %o7,$bias,%sp           ! alloca
107         ld      [$np+4],$npj            ! np[1]
108         mov     12,$j
109
110         mulx    $car0,$mul0,$car0       ! ap[0]*bp[0]
111         mulx    $apj,$mul0,$tmp0        !prologue! ap[1]*bp[0]
112         and     $car0,$mask,$acc0
113         add     %sp,$bias+$frame,$tp
114         ld      [$ap+8],$apj            !prologue!
115
116         mulx    $n0,$acc0,$mul1         ! "t[0]"*n0
117         and     $mul1,$mask,$mul1
118
119         mulx    $car1,$mul1,$car1       ! np[0]*"t[0]"*n0
120         mulx    $npj,$mul1,$acc1        !prologue! np[1]*"t[0]"*n0
121         srlx    $car0,32,$car0
122         add     $acc0,$car1,$car1
123         ld      [$np+8],$npj            !prologue!
124         srlx    $car1,32,$car1
125         mov     $tmp0,$acc0             !prologue!
126
127 .L1st:
128         mulx    $apj,$mul0,$tmp0
129         mulx    $npj,$mul1,$tmp1
130         add     $acc0,$car0,$car0
131         ld      [$ap+$j],$apj           ! ap[j]
132         and     $car0,$mask,$acc0
133         add     $acc1,$car1,$car1
134         ld      [$np+$j],$npj           ! np[j]
135         srlx    $car0,32,$car0
136         add     $acc0,$car1,$car1
137         add     $j,4,$j                 ! j++
138         mov     $tmp0,$acc0
139         st      $car1,[$tp]
140         cmp     $j,$num
141         mov     $tmp1,$acc1
142         srlx    $car1,32,$car1
143         bl      %icc,.L1st
144         add     $tp,4,$tp               ! tp++
145 !.L1st
146
147         mulx    $apj,$mul0,$tmp0        !epilogue!
148         mulx    $npj,$mul1,$tmp1
149         add     $acc0,$car0,$car0
150         and     $car0,$mask,$acc0
151         add     $acc1,$car1,$car1
152         srlx    $car0,32,$car0
153         add     $acc0,$car1,$car1
154         st      $car1,[$tp]
155         srlx    $car1,32,$car1
156
157         add     $tmp0,$car0,$car0
158         and     $car0,$mask,$acc0
159         add     $tmp1,$car1,$car1
160         srlx    $car0,32,$car0
161         add     $acc0,$car1,$car1
162         st      $car1,[$tp+4]
163         srlx    $car1,32,$car1
164
165         add     $car0,$car1,$car1
166         st      $car1,[$tp+8]
167         srlx    $car1,32,$car2
168 \f
169         mov     4,$i                    ! i++
170         ld      [$bp+4],$mul0           ! bp[1]
171 .Louter:
172         add     %sp,$bias+$frame,$tp
173         ld      [$ap],$car0             ! ap[0]
174         ld      [$ap+4],$apj            ! ap[1]
175         ld      [$np],$car1             ! np[0]
176         ld      [$np+4],$npj            ! np[1]
177         ld      [$tp],$tmp1             ! tp[0]
178         ld      [$tp+4],$tpj            ! tp[1]
179         mov     12,$j
180
181         mulx    $car0,$mul0,$car0
182         mulx    $apj,$mul0,$tmp0        !prologue!
183         add     $tmp1,$car0,$car0
184         ld      [$ap+8],$apj            !prologue!
185         and     $car0,$mask,$acc0
186
187         mulx    $n0,$acc0,$mul1
188         and     $mul1,$mask,$mul1
189
190         mulx    $car1,$mul1,$car1
191         mulx    $npj,$mul1,$acc1        !prologue!
192         srlx    $car0,32,$car0
193         add     $acc0,$car1,$car1
194         ld      [$np+8],$npj            !prologue!
195         srlx    $car1,32,$car1
196         mov     $tmp0,$acc0             !prologue!
197
198 .Linner:
199         mulx    $apj,$mul0,$tmp0
200         mulx    $npj,$mul1,$tmp1
201         add     $tpj,$car0,$car0
202         ld      [$ap+$j],$apj           ! ap[j]
203         add     $acc0,$car0,$car0
204         add     $acc1,$car1,$car1
205         ld      [$np+$j],$npj           ! np[j]
206         and     $car0,$mask,$acc0
207         ld      [$tp+8],$tpj            ! tp[j]
208         srlx    $car0,32,$car0
209         add     $acc0,$car1,$car1
210         add     $j,4,$j                 ! j++
211         mov     $tmp0,$acc0
212         st      $car1,[$tp]             ! tp[j-1]
213         srlx    $car1,32,$car1
214         mov     $tmp1,$acc1
215         cmp     $j,$num
216         bl      %icc,.Linner
217         add     $tp,4,$tp               ! tp++
218 !.Linner
219
220         mulx    $apj,$mul0,$tmp0        !epilogue!
221         mulx    $npj,$mul1,$tmp1
222         add     $tpj,$car0,$car0
223         add     $acc0,$car0,$car0
224         ld      [$tp+8],$tpj            ! tp[j]
225         and     $car0,$mask,$acc0
226         add     $acc1,$car1,$car1
227         srlx    $car0,32,$car0
228         add     $acc0,$car1,$car1
229         st      $car1,[$tp]             ! tp[j-1]
230         srlx    $car1,32,$car1
231
232         add     $tpj,$car0,$car0
233         add     $tmp0,$car0,$car0
234         and     $car0,$mask,$acc0
235         add     $tmp1,$car1,$car1
236         add     $acc0,$car1,$car1
237         st      $car1,[$tp+4]           ! tp[j-1]
238         srlx    $car0,32,$car0
239         add     $i,4,$i                 ! i++
240         srlx    $car1,32,$car1
241
242         add     $car0,$car1,$car1
243         cmp     $i,$num
244         add     $car2,$car1,$car1
245         st      $car1,[$tp+8]
246
247         srlx    $car1,32,$car2
248         bl,a    %icc,.Louter
249         ld      [$bp+$i],$mul0          ! bp[i]
250 !.Louter
251
252         add     $tp,12,$tp
253 \f
254 .Ltail:
255         add     $np,$num,$np
256         add     $rp,$num,$rp
257
258         cmp     $car2,0                 ! clears %icc.c
259         bne,pn  %icc,.Lsub
260         sub     %g0,$num,%o7            ! k=-num
261
262         cmp     $car1,$npj              ! compare top-most $tp and $np words
263         bcs,pt  %icc,.Lcopy             ! %icc.c is clean if not taken
264         nop
265
266 .align  16,0x1000000
267 .Lsub:
268         ld      [$tp+%o7],%o0
269         ld      [$np+%o7],%o1
270         subccc  %o0,%o1,%o1
271         st      %o1,[$rp+%o7]
272         add     %o7,4,%o7
273         brnz    %o7,.Lsub
274         nop
275         subccc  $car2,0,$car2
276         bcc     %icc,.Lzap
277         sub     %g0,$num,%o7
278
279 .align  16,0x1000000
280 .Lcopy:
281         ld      [$tp+%o7],%o0
282         st      %o0,[$rp+%o7]
283         add     %o7,4,%o7
284         brnz    %o7,.Lcopy
285         nop
286         ba      .Lzap
287         sub     %g0,$num,%o7
288
289 .align  32
290 .Lzap:
291         st      %g0,[$tp+%o7]
292         add     %o7,4,%o7
293         brnz    %o7,.Lzap
294         nop
295         mov     1,%i0
296         ret
297         restore
298 ___
299 \f
300 ########
301 ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
302 ######## code without the following dedicated squaring procedure.
303 ########
######## Reached from the entry code when ap and bp are the same pointer;
######## $mul0 already holds ap[0] at that point (it was loaded through $bp).
######## Each ap[j]*ap[i] cross product is computed once and doubled
######## (add x,x,x), with $sbit carrying the bit shifted out of one doubled
######## word into the next. The passes for ap[0] (.Lsqr_1st) and ap[1]
######## (.Lsqr_2nd) are written out separately, .Lsqr_outer handles the
######## following ones and .Lsqr_last the final one, after which control
######## jumps to .Ltail in the first template for the final reduction.
######## Instruction order and branch delay slots are significant here too.
304 $sbit="%i2";            # re-use $bp! (the squaring path never reads bp again)
305
306 $code.=<<___;
307 .align  32
308 .Lbn_sqr_mont:
309         add     %sp,$bias,%o7                   ! real top of stack
310         ld      [$ap+4],$apj                    ! ap[1]
311         sub     %o7,$num,%o7
312         ld      [$np],$car1                     ! np[0]
313         and     %o7,-1024,%o7
314         ld      [$np+4],$npj                    ! np[1]
315         sub     %o7,$bias,%sp                   ! alloca
316         mov     12,$j
317
318         mulx    $mul0,$mul0,$car0               ! ap[0]*ap[0]
319         mulx    $apj,$mul0,$tmp0                !prologue!
320         and     $car0,$mask,$acc0
321         add     %sp,$bias+$frame,$tp
322         ld      [$ap+8],$apj                    !prologue!
323
324         mulx    $n0,$acc0,$mul1                 ! "t[0]"*n0
325         srlx    $car0,32,$car0
326         and     $mul1,$mask,$mul1
327
328         mulx    $car1,$mul1,$car1               ! np[0]*"t[0]"*n0
329         mulx    $npj,$mul1,$acc1                !prologue!
330         and     $car0,1,$sbit
331         ld      [$np+8],$npj                    !prologue!
332         srlx    $car0,1,$car0
333         add     $acc0,$car1,$car1
334         srlx    $car1,32,$car1
335         mov     $tmp0,$acc0                     !prologue!
336
337 .Lsqr_1st:
338         mulx    $apj,$mul0,$tmp0
339         mulx    $npj,$mul1,$tmp1
340         add     $acc0,$car0,$car0               ! ap[j]*a0+c0
341         add     $acc1,$car1,$car1
342         ld      [$ap+$j],$apj                   ! ap[j]
343         and     $car0,$mask,$acc0
344         ld      [$np+$j],$npj                   ! np[j]
345         srlx    $car0,32,$car0
346         add     $acc0,$acc0,$acc0
347         or      $sbit,$acc0,$acc0
348         mov     $tmp1,$acc1
349         srlx    $acc0,32,$sbit
350         add     $j,4,$j                         ! j++
351         and     $acc0,$mask,$acc0
352         cmp     $j,$num
353         add     $acc0,$car1,$car1
354         st      $car1,[$tp]
355         mov     $tmp0,$acc0
356         srlx    $car1,32,$car1
357         bl      %icc,.Lsqr_1st
358         add     $tp,4,$tp                       ! tp++
359 !.Lsqr_1st
360
361         mulx    $apj,$mul0,$tmp0                ! epilogue
362         mulx    $npj,$mul1,$tmp1
363         add     $acc0,$car0,$car0               ! ap[j]*a0+c0
364         add     $acc1,$car1,$car1
365         and     $car0,$mask,$acc0
366         srlx    $car0,32,$car0
367         add     $acc0,$acc0,$acc0
368         or      $sbit,$acc0,$acc0
369         srlx    $acc0,32,$sbit
370         and     $acc0,$mask,$acc0
371         add     $acc0,$car1,$car1
372         st      $car1,[$tp]
373         srlx    $car1,32,$car1
374
375         add     $tmp0,$car0,$car0               ! ap[j]*a0+c0
376         add     $tmp1,$car1,$car1
377         and     $car0,$mask,$acc0
378         srlx    $car0,32,$car0
379         add     $acc0,$acc0,$acc0
380         or      $sbit,$acc0,$acc0
381         srlx    $acc0,32,$sbit
382         and     $acc0,$mask,$acc0
383         add     $acc0,$car1,$car1
384         st      $car1,[$tp+4]
385         srlx    $car1,32,$car1
386
387         add     $car0,$car0,$car0
388         or      $sbit,$car0,$car0
389         add     $car0,$car1,$car1
390         st      $car1,[$tp+8]
391         srlx    $car1,32,$car2
392 \f
393         ld      [%sp+$bias+$frame],$tmp0        ! tp[0]
394         ld      [%sp+$bias+$frame+4],$tmp1      ! tp[1]
395         ld      [%sp+$bias+$frame+8],$tpj       ! tp[2]
396         ld      [$ap+4],$mul0                   ! ap[1]
397         ld      [$ap+8],$apj                    ! ap[2]
398         ld      [$np],$car1                     ! np[0]
399         ld      [$np+4],$npj                    ! np[1]
400         mulx    $n0,$tmp0,$mul1
401
402         mulx    $mul0,$mul0,$car0
403         and     $mul1,$mask,$mul1
404
405         mulx    $car1,$mul1,$car1
406         mulx    $npj,$mul1,$acc1
407         add     $tmp0,$car1,$car1
408         and     $car0,$mask,$acc0
409         ld      [$np+8],$npj                    ! np[2]
410         srlx    $car1,32,$car1
411         add     $tmp1,$car1,$car1
412         srlx    $car0,32,$car0
413         add     $acc0,$car1,$car1
414         and     $car0,1,$sbit
415         add     $acc1,$car1,$car1
416         srlx    $car0,1,$car0
417         mov     12,$j
418         st      $car1,[%sp+$bias+$frame]        ! tp[0]=
419         srlx    $car1,32,$car1
420         add     %sp,$bias+$frame+4,$tp
421
422 .Lsqr_2nd:
423         mulx    $apj,$mul0,$acc0
424         mulx    $npj,$mul1,$acc1
425         add     $acc0,$car0,$car0
426         add     $tpj,$car1,$car1
427         ld      [$ap+$j],$apj                   ! ap[j]
428         and     $car0,$mask,$acc0
429         ld      [$np+$j],$npj                   ! np[j]
430         srlx    $car0,32,$car0
431         add     $acc1,$car1,$car1
432         ld      [$tp+8],$tpj                    ! tp[j]
433         add     $acc0,$acc0,$acc0
434         add     $j,4,$j                         ! j++
435         or      $sbit,$acc0,$acc0
436         srlx    $acc0,32,$sbit
437         and     $acc0,$mask,$acc0
438         cmp     $j,$num
439         add     $acc0,$car1,$car1
440         st      $car1,[$tp]                     ! tp[j-1]
441         srlx    $car1,32,$car1
442         bl      %icc,.Lsqr_2nd
443         add     $tp,4,$tp                       ! tp++
444 !.Lsqr_2nd
445
446         mulx    $apj,$mul0,$acc0
447         mulx    $npj,$mul1,$acc1
448         add     $acc0,$car0,$car0
449         add     $tpj,$car1,$car1
450         and     $car0,$mask,$acc0
451         srlx    $car0,32,$car0
452         add     $acc1,$car1,$car1
453         add     $acc0,$acc0,$acc0
454         or      $sbit,$acc0,$acc0
455         srlx    $acc0,32,$sbit
456         and     $acc0,$mask,$acc0
457         add     $acc0,$car1,$car1
458         st      $car1,[$tp]                     ! tp[j-1]
459         srlx    $car1,32,$car1
460
461         add     $car0,$car0,$car0
462         or      $sbit,$car0,$car0
463         add     $car0,$car1,$car1
464         add     $car2,$car1,$car1
465         st      $car1,[$tp+4]
466         srlx    $car1,32,$car2
467 \f
468         ld      [%sp+$bias+$frame],$tmp1        ! tp[0]
469         ld      [%sp+$bias+$frame+4],$tpj       ! tp[1]
470         ld      [$ap+8],$mul0                   ! ap[2]
471         ld      [$np],$car1                     ! np[0]
472         ld      [$np+4],$npj                    ! np[1]
473         mulx    $n0,$tmp1,$mul1
474         and     $mul1,$mask,$mul1
475         mov     8,$i
476
477         mulx    $mul0,$mul0,$car0
478         mulx    $car1,$mul1,$car1
479         and     $car0,$mask,$acc0
480         add     $tmp1,$car1,$car1
481         srlx    $car0,32,$car0
482         add     %sp,$bias+$frame,$tp
483         srlx    $car1,32,$car1
484         and     $car0,1,$sbit
485         srlx    $car0,1,$car0
486         mov     4,$j
487
488 .Lsqr_outer:
489 .Lsqr_inner1:
490         mulx    $npj,$mul1,$acc1
491         add     $tpj,$car1,$car1
492         add     $j,4,$j
493         ld      [$tp+8],$tpj
494         cmp     $j,$i
495         add     $acc1,$car1,$car1
496         ld      [$np+$j],$npj
497         st      $car1,[$tp]
498         srlx    $car1,32,$car1
499         bl      %icc,.Lsqr_inner1
500         add     $tp,4,$tp
501 !.Lsqr_inner1
502
503         add     $j,4,$j
504         ld      [$ap+$j],$apj                   ! ap[j]
505         mulx    $npj,$mul1,$acc1
506         add     $tpj,$car1,$car1
507         ld      [$np+$j],$npj                   ! np[j]
508         add     $acc0,$car1,$car1
509         ld      [$tp+8],$tpj                    ! tp[j]
510         add     $acc1,$car1,$car1
511         st      $car1,[$tp]
512         srlx    $car1,32,$car1
513
514         add     $j,4,$j
515         cmp     $j,$num
516         be,pn   %icc,.Lsqr_no_inner2
517         add     $tp,4,$tp
518
519 .Lsqr_inner2:
520         mulx    $apj,$mul0,$acc0
521         mulx    $npj,$mul1,$acc1
522         add     $tpj,$car1,$car1
523         add     $acc0,$car0,$car0
524         ld      [$ap+$j],$apj                   ! ap[j]
525         and     $car0,$mask,$acc0
526         ld      [$np+$j],$npj                   ! np[j]
527         srlx    $car0,32,$car0
528         add     $acc0,$acc0,$acc0
529         ld      [$tp+8],$tpj                    ! tp[j]
530         or      $sbit,$acc0,$acc0
531         add     $j,4,$j                         ! j++
532         srlx    $acc0,32,$sbit
533         and     $acc0,$mask,$acc0
534         cmp     $j,$num
535         add     $acc0,$car1,$car1
536         add     $acc1,$car1,$car1
537         st      $car1,[$tp]                     ! tp[j-1]
538         srlx    $car1,32,$car1
539         bl      %icc,.Lsqr_inner2
540         add     $tp,4,$tp                       ! tp++
541
542 .Lsqr_no_inner2:
543         mulx    $apj,$mul0,$acc0
544         mulx    $npj,$mul1,$acc1
545         add     $tpj,$car1,$car1
546         add     $acc0,$car0,$car0
547         and     $car0,$mask,$acc0
548         srlx    $car0,32,$car0
549         add     $acc0,$acc0,$acc0
550         or      $sbit,$acc0,$acc0
551         srlx    $acc0,32,$sbit
552         and     $acc0,$mask,$acc0
553         add     $acc0,$car1,$car1
554         add     $acc1,$car1,$car1
555         st      $car1,[$tp]                     ! tp[j-1]
556         srlx    $car1,32,$car1
557
558         add     $car0,$car0,$car0
559         or      $sbit,$car0,$car0
560         add     $car0,$car1,$car1
561         add     $car2,$car1,$car1
562         st      $car1,[$tp+4]
563         srlx    $car1,32,$car2
564 \f
565         add     $i,4,$i                         ! i++
566         ld      [%sp+$bias+$frame],$tmp1        ! tp[0]
567         ld      [%sp+$bias+$frame+4],$tpj       ! tp[1]
568         ld      [$ap+$i],$mul0                  ! ap[j]
569         ld      [$np],$car1                     ! np[0]
570         ld      [$np+4],$npj                    ! np[1]
571         mulx    $n0,$tmp1,$mul1
572         and     $mul1,$mask,$mul1
573         add     $i,4,$tmp0
574
575         mulx    $mul0,$mul0,$car0
576         mulx    $car1,$mul1,$car1
577         and     $car0,$mask,$acc0
578         add     $tmp1,$car1,$car1
579         srlx    $car0,32,$car0
580         add     %sp,$bias+$frame,$tp
581         srlx    $car1,32,$car1
582         and     $car0,1,$sbit
583         srlx    $car0,1,$car0
584
585         cmp     $tmp0,$num                      ! i<num-1
586         bl      %icc,.Lsqr_outer
587         mov     4,$j
588 \f
589 .Lsqr_last:
590         mulx    $npj,$mul1,$acc1
591         add     $tpj,$car1,$car1
592         add     $j,4,$j
593         ld      [$tp+8],$tpj
594         cmp     $j,$i
595         add     $acc1,$car1,$car1
596         ld      [$np+$j],$npj
597         st      $car1,[$tp]
598         srlx    $car1,32,$car1
599         bl      %icc,.Lsqr_last
600         add     $tp,4,$tp
601 !.Lsqr_last
602
603         mulx    $npj,$mul1,$acc1
604         add     $tpj,$car1,$car1
605         add     $acc0,$car1,$car1
606         add     $acc1,$car1,$car1
607         st      $car1,[$tp]
608         srlx    $car1,32,$car1
609
610         add     $car0,$car0,$car0               ! recover $car0
611         or      $sbit,$car0,$car0
612         add     $car0,$car1,$car1
613         add     $car2,$car1,$car1
614         st      $car1,[$tp+4]
615         srlx    $car1,32,$car2
616
617         ba      .Ltail
618         add     $tp,8,$tp
619 .type   $fname,#function
620 .size   $fname,(.-$fname)
621 ___
# Expand every `...` span in the template by evaluating its contents as Perl
# (this is how the %icc/%xcc choice on the be,pt line is resolved), then
# write the finished assembly to STDOUT.
622 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
623 print $code;
# close() is where a failed flush of buffered output is reported (full disk,
# closed pipe). Die instead of exiting successfully so that the build never
# goes on to assemble a truncated file.
624 close STDOUT or die "error closing STDOUT: $!";