bn/asm/sparcv9-mont.pl: iron another glitch in squaring code path.
[openssl.git] / crypto / bn / asm / sparcv9-mont.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # December 2005
11 #
12 # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
13 # for undertaken effort are multiple. First of all, UltraSPARC is not
14 # the whole SPARCv9 universe and other VIS-free implementations deserve
15 # optimized code as much. Secondly, newly introduced UltraSPARC T1,
16 # a.k.a. Niagara, has shared FPU and concurrent FPU-intensive pathes,
17 # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
18 # several integrated RSA/DSA accelerator circuits accessible through
19 # kernel driver [only(*)], but having decent user-land software
20 # implementation is important too. Finally, reasons like desire to
21 # experiment with dedicated squaring procedure. Yes, this module
22 # implements one, because it was easiest to draft it in SPARCv9
23 # instructions...
24
25 # (*)   Engine accessing the driver in question is on my TODO list.
26 #       For reference, acceleator is estimated to give 6 to 10 times
27 #       improvement on single-threaded RSA sign. It should be noted
28 #       that 6-10x improvement coefficient does not actually mean
29 #       something extraordinary in terms of absolute [single-threaded]
30 #       performance, as SPARCv9 instruction set is by all means least
31 #       suitable for high performance crypto among other 64 bit
32 #       platforms. 6-10x factor simply places T1 in same performance
33 #       domain as say AMD64 and IA-64. Improvement of RSA verify don't
34 #       appear impressive at all, but it's the sign operation which is
35 #       far more critical/interesting.
36
37 # You might notice that inner loops are modulo-scheduled:-) This has
38 # essentially negligible impact on UltraSPARC performance, it's
39 # Fujitsu SPARC64 V users who should notice and hopefully appreciate
40 # the advantage... Currently this module surpasses sparcv9a-mont.pl
41 # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
42 # module still have hidden potential [see TODO list there], which is
43 # estimated to be larger than 20%...
44
45 # int bn_mul_mont(
46 $rp="%i0";      # BN_ULONG *rp,
47 $ap="%i1";      # const BN_ULONG *ap,
48 $bp="%i2";      # const BN_ULONG *bp,
49 $np="%i3";      # const BN_ULONG *np,
50 $n0="%i4";      # const BN_ULONG *n0,
51 $num="%i5";     # int num);
52
53 $bits=32;
54 for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
55 if ($bits==64)  { $bias=2047; $frame=192; }
56 else            { $bias=0;    $frame=128; }
57
58 $car0="%o0";
59 $car1="%o1";
60 $car2="%o2";    # 1 bit
61 $acc0="%o3";
62 $acc1="%o4";
63 $mask="%g1";    # 32 bits, what a waste...
64 $tmp0="%g4";
65 $tmp1="%g5";
66
67 $i="%l0";
68 $j="%l1";
69 $mul0="%l2";
70 $mul1="%l3";
71 $tp="%l4";
72 $apj="%l5";
73 $npj="%l6";
74 $tpj="%l7";
75
76 $fname="bn_mul_mont_int";
77
78 $code=<<___;
79 .section        ".text",#alloc,#execinstr
80
81 .global $fname
82 .align  32
83 $fname:
84         cmp     %o5,4                   ! 128 bits minimum
85         bge,pt  %icc,.Lenter
86         sethi   %hi(0xffffffff),$mask
87         retl
88         clr     %o0
89 .align  32
90 .Lenter:
91         save    %sp,-$frame,%sp
92         sll     $num,2,$num             ! num*=4
93         or      $mask,%lo(0xffffffff),$mask
94         ld      [$n0],$n0
95         cmp     $ap,$bp
96         and     $num,$mask,$num
97         ld      [$bp],$mul0             ! bp[0]
98         nop
99
100         add     %sp,$bias,%o7           ! real top of stack
101         ld      [$ap],$car0             ! ap[0] ! redundant in squaring context
102         sub     %o7,$num,%o7
103         ld      [$ap+4],$apj            ! ap[1]
104         and     %o7,-1024,%o7
105         ld      [$np],$car1             ! np[0]
106         sub     %o7,$bias,%sp           ! alloca
107         ld      [$np+4],$npj            ! np[1]
108         be,pt   `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
109         mov     12,$j
110
111         mulx    $car0,$mul0,$car0       ! ap[0]*bp[0]
112         mulx    $apj,$mul0,$tmp0        !prologue! ap[1]*bp[0]
113         and     $car0,$mask,$acc0
114         add     %sp,$bias+$frame,$tp
115         ld      [$ap+8],$apj            !prologue!
116
117         mulx    $n0,$acc0,$mul1         ! "t[0]"*n0
118         and     $mul1,$mask,$mul1
119
120         mulx    $car1,$mul1,$car1       ! np[0]*"t[0]"*n0
121         mulx    $npj,$mul1,$acc1        !prologue! np[1]*"t[0]"*n0
122         srlx    $car0,32,$car0
123         add     $acc0,$car1,$car1
124         ld      [$np+8],$npj            !prologue!
125         srlx    $car1,32,$car1
126         mov     $tmp0,$acc0             !prologue!
127
128 .L1st:
129         mulx    $apj,$mul0,$tmp0
130         mulx    $npj,$mul1,$tmp1
131         add     $acc0,$car0,$car0
132         ld      [$ap+$j],$apj           ! ap[j]
133         and     $car0,$mask,$acc0
134         add     $acc1,$car1,$car1
135         ld      [$np+$j],$npj           ! np[j]
136         srlx    $car0,32,$car0
137         add     $acc0,$car1,$car1
138         add     $j,4,$j                 ! j++
139         mov     $tmp0,$acc0
140         st      $car1,[$tp]
141         cmp     $j,$num
142         mov     $tmp1,$acc1
143         srlx    $car1,32,$car1
144         bl      %icc,.L1st
145         add     $tp,4,$tp               ! tp++
146 !.L1st
147
148         mulx    $apj,$mul0,$tmp0        !epilogue!
149         mulx    $npj,$mul1,$tmp1
150         add     $acc0,$car0,$car0
151         and     $car0,$mask,$acc0
152         add     $acc1,$car1,$car1
153         srlx    $car0,32,$car0
154         add     $acc0,$car1,$car1
155         st      $car1,[$tp]
156         srlx    $car1,32,$car1
157
158         add     $tmp0,$car0,$car0
159         and     $car0,$mask,$acc0
160         add     $tmp1,$car1,$car1
161         srlx    $car0,32,$car0
162         add     $acc0,$car1,$car1
163         st      $car1,[$tp+4]
164         srlx    $car1,32,$car1
165
166         add     $car0,$car1,$car1
167         st      $car1,[$tp+8]
168         srlx    $car1,32,$car2
169 \f
170         mov     4,$i                    ! i++
171         ld      [$bp+4],$mul0           ! bp[1]
172 .Louter:
173         add     %sp,$bias+$frame,$tp
174         ld      [$ap],$car0             ! ap[0]
175         ld      [$ap+4],$apj            ! ap[1]
176         ld      [$np],$car1             ! np[0]
177         ld      [$np+4],$npj            ! np[1]
178         ld      [$tp],$tmp1             ! tp[0]
179         ld      [$tp+4],$tpj            ! tp[1]
180         mov     12,$j
181
182         mulx    $car0,$mul0,$car0
183         mulx    $apj,$mul0,$tmp0        !prologue!
184         add     $tmp1,$car0,$car0
185         ld      [$ap+8],$apj            !prologue!
186         and     $car0,$mask,$acc0
187
188         mulx    $n0,$acc0,$mul1
189         and     $mul1,$mask,$mul1
190
191         mulx    $car1,$mul1,$car1
192         mulx    $npj,$mul1,$acc1        !prologue!
193         srlx    $car0,32,$car0
194         add     $acc0,$car1,$car1
195         ld      [$np+8],$npj            !prologue!
196         srlx    $car1,32,$car1
197         mov     $tmp0,$acc0             !prologue!
198
199 .Linner:
200         mulx    $apj,$mul0,$tmp0
201         mulx    $npj,$mul1,$tmp1
202         add     $tpj,$car0,$car0
203         ld      [$ap+$j],$apj           ! ap[j]
204         add     $acc0,$car0,$car0
205         add     $acc1,$car1,$car1
206         ld      [$np+$j],$npj           ! np[j]
207         and     $car0,$mask,$acc0
208         ld      [$tp+8],$tpj            ! tp[j]
209         srlx    $car0,32,$car0
210         add     $acc0,$car1,$car1
211         add     $j,4,$j                 ! j++
212         mov     $tmp0,$acc0
213         st      $car1,[$tp]             ! tp[j-1]
214         srlx    $car1,32,$car1
215         mov     $tmp1,$acc1
216         cmp     $j,$num
217         bl      %icc,.Linner
218         add     $tp,4,$tp               ! tp++
219 !.Linner
220
221         mulx    $apj,$mul0,$tmp0        !epilogue!
222         mulx    $npj,$mul1,$tmp1
223         add     $tpj,$car0,$car0
224         add     $acc0,$car0,$car0
225         ld      [$tp+8],$tpj            ! tp[j]
226         and     $car0,$mask,$acc0
227         add     $acc1,$car1,$car1
228         srlx    $car0,32,$car0
229         add     $acc0,$car1,$car1
230         st      $car1,[$tp]             ! tp[j-1]
231         srlx    $car1,32,$car1
232
233         add     $tpj,$car0,$car0
234         add     $tmp0,$car0,$car0
235         and     $car0,$mask,$acc0
236         add     $tmp1,$car1,$car1
237         add     $acc0,$car1,$car1
238         st      $car1,[$tp+4]           ! tp[j-1]
239         srlx    $car0,32,$car0
240         add     $i,4,$i                 ! i++
241         srlx    $car1,32,$car1
242
243         add     $car0,$car1,$car1
244         cmp     $i,$num
245         add     $car2,$car1,$car1
246         st      $car1,[$tp+8]
247
248         srlx    $car1,32,$car2
249         bl,a    %icc,.Louter
250         ld      [$bp+$i],$mul0          ! bp[i]
251 !.Louter
252
253         add     $tp,12,$tp
254 \f
255 .Ltail:
256         add     $np,$num,$np
257         add     $rp,$num,$rp
258         sub     %g0,$num,%o7            ! k=-num
259         ba      .Lsub
260         subcc   %g0,%g0,%g0             ! clear %icc.c
261 .align  16
262 .Lsub:
263         ld      [$tp+%o7],%o0
264         ld      [$np+%o7],%o1
265         subccc  %o0,%o1,%o1             ! tp[j]-np[j]
266         add     $rp,%o7,$i
267         add     %o7,4,%o7
268         brnz    %o7,.Lsub
269         st      %o1,[$i]
270         subccc  $car2,0,$car2           ! handle upmost overflow bit
271         sub     %g0,$num,%o7
272
273 .Lcopy:
274         ld      [$tp+%o7],%o1           ! conditional copy
275         ld      [$rp+%o7],%o0
276         st      %g0,[$tp+%o7]           ! zap tp
277         movcs   %icc,%o1,%o0
278         st      %o0,[$rp+%o7]
279         add     %o7,4,%o7
280         brnz    %o7,.Lcopy
281         nop
282         mov     1,%i0
283         ret
284         restore
285 ___
286 \f
287 ########
288 ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
289 ######## code without following dedicated squaring procedure.
290 ########
291 $sbit="%o5";
292
293 $code.=<<___;
294 .align  32
295 .Lbn_sqr_mont:
296         mulx    $mul0,$mul0,$car0               ! ap[0]*ap[0]
297         mulx    $apj,$mul0,$tmp0                !prologue!
298         and     $car0,$mask,$acc0
299         add     %sp,$bias+$frame,$tp
300         ld      [$ap+8],$apj                    !prologue!
301
302         mulx    $n0,$acc0,$mul1                 ! "t[0]"*n0
303         srlx    $car0,32,$car0
304         and     $mul1,$mask,$mul1
305
306         mulx    $car1,$mul1,$car1               ! np[0]*"t[0]"*n0
307         mulx    $npj,$mul1,$acc1                !prologue!
308         and     $car0,1,$sbit
309         ld      [$np+8],$npj                    !prologue!
310         srlx    $car0,1,$car0
311         add     $acc0,$car1,$car1
312         srlx    $car1,32,$car1
313         mov     $tmp0,$acc0                     !prologue!
314
315 .Lsqr_1st:
316         mulx    $apj,$mul0,$tmp0
317         mulx    $npj,$mul1,$tmp1
318         add     $acc0,$car0,$car0               ! ap[j]*a0+c0
319         add     $acc1,$car1,$car1
320         ld      [$ap+$j],$apj                   ! ap[j]
321         and     $car0,$mask,$acc0
322         ld      [$np+$j],$npj                   ! np[j]
323         srlx    $car0,32,$car0
324         add     $acc0,$acc0,$acc0
325         or      $sbit,$acc0,$acc0
326         mov     $tmp1,$acc1
327         srlx    $acc0,32,$sbit
328         add     $j,4,$j                         ! j++
329         and     $acc0,$mask,$acc0
330         cmp     $j,$num
331         add     $acc0,$car1,$car1
332         st      $car1,[$tp]
333         mov     $tmp0,$acc0
334         srlx    $car1,32,$car1
335         bl      %icc,.Lsqr_1st
336         add     $tp,4,$tp                       ! tp++
337 !.Lsqr_1st
338
339         mulx    $apj,$mul0,$tmp0                ! epilogue
340         mulx    $npj,$mul1,$tmp1
341         add     $acc0,$car0,$car0               ! ap[j]*a0+c0
342         add     $acc1,$car1,$car1
343         and     $car0,$mask,$acc0
344         srlx    $car0,32,$car0
345         add     $acc0,$acc0,$acc0
346         or      $sbit,$acc0,$acc0
347         srlx    $acc0,32,$sbit
348         and     $acc0,$mask,$acc0
349         add     $acc0,$car1,$car1
350         st      $car1,[$tp]
351         srlx    $car1,32,$car1
352
353         add     $tmp0,$car0,$car0               ! ap[j]*a0+c0
354         add     $tmp1,$car1,$car1
355         and     $car0,$mask,$acc0
356         srlx    $car0,32,$car0
357         add     $acc0,$acc0,$acc0
358         or      $sbit,$acc0,$acc0
359         srlx    $acc0,32,$sbit
360         and     $acc0,$mask,$acc0
361         add     $acc0,$car1,$car1
362         st      $car1,[$tp+4]
363         srlx    $car1,32,$car1
364
365         add     $car0,$car0,$car0
366         or      $sbit,$car0,$car0
367         add     $car0,$car1,$car1
368         st      $car1,[$tp+8]
369         srlx    $car1,32,$car2
370 \f
371         ld      [%sp+$bias+$frame],$tmp0        ! tp[0]
372         ld      [%sp+$bias+$frame+4],$tmp1      ! tp[1]
373         ld      [%sp+$bias+$frame+8],$tpj       ! tp[2]
374         ld      [$ap+4],$mul0                   ! ap[1]
375         ld      [$ap+8],$apj                    ! ap[2]
376         ld      [$np],$car1                     ! np[0]
377         ld      [$np+4],$npj                    ! np[1]
378         mulx    $n0,$tmp0,$mul1
379
380         mulx    $mul0,$mul0,$car0
381         and     $mul1,$mask,$mul1
382
383         mulx    $car1,$mul1,$car1
384         mulx    $npj,$mul1,$acc1
385         add     $tmp0,$car1,$car1
386         and     $car0,$mask,$acc0
387         ld      [$np+8],$npj                    ! np[2]
388         srlx    $car1,32,$car1
389         add     $tmp1,$car1,$car1
390         srlx    $car0,32,$car0
391         add     $acc0,$car1,$car1
392         and     $car0,1,$sbit
393         add     $acc1,$car1,$car1
394         srlx    $car0,1,$car0
395         mov     12,$j
396         st      $car1,[%sp+$bias+$frame]        ! tp[0]=
397         srlx    $car1,32,$car1
398         add     %sp,$bias+$frame+4,$tp
399
400 .Lsqr_2nd:
401         mulx    $apj,$mul0,$acc0
402         mulx    $npj,$mul1,$acc1
403         add     $acc0,$car0,$car0
404         add     $tpj,$sbit,$sbit
405         ld      [$ap+$j],$apj                   ! ap[j]
406         and     $car0,$mask,$acc0
407         ld      [$np+$j],$npj                   ! np[j]
408         srlx    $car0,32,$car0
409         add     $acc1,$car1,$car1
410         ld      [$tp+8],$tpj                    ! tp[j]
411         add     $acc0,$acc0,$acc0
412         add     $j,4,$j                         ! j++
413         add     $sbit,$acc0,$acc0
414         srlx    $acc0,32,$sbit
415         and     $acc0,$mask,$acc0
416         cmp     $j,$num
417         add     $acc0,$car1,$car1
418         st      $car1,[$tp]                     ! tp[j-1]
419         srlx    $car1,32,$car1
420         bl      %icc,.Lsqr_2nd
421         add     $tp,4,$tp                       ! tp++
422 !.Lsqr_2nd
423
424         mulx    $apj,$mul0,$acc0
425         mulx    $npj,$mul1,$acc1
426         add     $acc0,$car0,$car0
427         add     $tpj,$sbit,$sbit
428         and     $car0,$mask,$acc0
429         srlx    $car0,32,$car0
430         add     $acc1,$car1,$car1
431         add     $acc0,$acc0,$acc0
432         add     $sbit,$acc0,$acc0
433         srlx    $acc0,32,$sbit
434         and     $acc0,$mask,$acc0
435         add     $acc0,$car1,$car1
436         st      $car1,[$tp]                     ! tp[j-1]
437         srlx    $car1,32,$car1
438
439         add     $car0,$car0,$car0
440         add     $sbit,$car0,$car0
441         add     $car0,$car1,$car1
442         add     $car2,$car1,$car1
443         st      $car1,[$tp+4]
444         srlx    $car1,32,$car2
445 \f
446         ld      [%sp+$bias+$frame],$tmp1        ! tp[0]
447         ld      [%sp+$bias+$frame+4],$tpj       ! tp[1]
448         ld      [$ap+8],$mul0                   ! ap[2]
449         ld      [$np],$car1                     ! np[0]
450         ld      [$np+4],$npj                    ! np[1]
451         mulx    $n0,$tmp1,$mul1
452         and     $mul1,$mask,$mul1
453         mov     8,$i
454
455         mulx    $mul0,$mul0,$car0
456         mulx    $car1,$mul1,$car1
457         and     $car0,$mask,$acc0
458         add     $tmp1,$car1,$car1
459         srlx    $car0,32,$car0
460         add     %sp,$bias+$frame,$tp
461         srlx    $car1,32,$car1
462         and     $car0,1,$sbit
463         srlx    $car0,1,$car0
464         mov     4,$j
465
466 .Lsqr_outer:
467 .Lsqr_inner1:
468         mulx    $npj,$mul1,$acc1
469         add     $tpj,$car1,$car1
470         add     $j,4,$j
471         ld      [$tp+8],$tpj
472         cmp     $j,$i
473         add     $acc1,$car1,$car1
474         ld      [$np+$j],$npj
475         st      $car1,[$tp]
476         srlx    $car1,32,$car1
477         bl      %icc,.Lsqr_inner1
478         add     $tp,4,$tp
479 !.Lsqr_inner1
480
481         add     $j,4,$j
482         ld      [$ap+$j],$apj                   ! ap[j]
483         mulx    $npj,$mul1,$acc1
484         add     $tpj,$car1,$car1
485         ld      [$np+$j],$npj                   ! np[j]
486         srlx    $car1,32,$tmp0
487         and     $car1,$mask,$car1
488         add     $tmp0,$sbit,$sbit
489         add     $acc0,$car1,$car1
490         ld      [$tp+8],$tpj                    ! tp[j]
491         add     $acc1,$car1,$car1
492         st      $car1,[$tp]
493         srlx    $car1,32,$car1
494
495         add     $j,4,$j
496         cmp     $j,$num
497         be,pn   %icc,.Lsqr_no_inner2
498         add     $tp,4,$tp
499
500 .Lsqr_inner2:
501         mulx    $apj,$mul0,$acc0
502         mulx    $npj,$mul1,$acc1
503         add     $tpj,$sbit,$sbit
504         add     $acc0,$car0,$car0
505         ld      [$ap+$j],$apj                   ! ap[j]
506         and     $car0,$mask,$acc0
507         ld      [$np+$j],$npj                   ! np[j]
508         srlx    $car0,32,$car0
509         add     $acc0,$acc0,$acc0
510         ld      [$tp+8],$tpj                    ! tp[j]
511         add     $sbit,$acc0,$acc0
512         add     $j,4,$j                         ! j++
513         srlx    $acc0,32,$sbit
514         and     $acc0,$mask,$acc0
515         cmp     $j,$num
516         add     $acc0,$car1,$car1
517         add     $acc1,$car1,$car1
518         st      $car1,[$tp]                     ! tp[j-1]
519         srlx    $car1,32,$car1
520         bl      %icc,.Lsqr_inner2
521         add     $tp,4,$tp                       ! tp++
522
523 .Lsqr_no_inner2:
524         mulx    $apj,$mul0,$acc0
525         mulx    $npj,$mul1,$acc1
526         add     $tpj,$sbit,$sbit
527         add     $acc0,$car0,$car0
528         and     $car0,$mask,$acc0
529         srlx    $car0,32,$car0
530         add     $acc0,$acc0,$acc0
531         add     $sbit,$acc0,$acc0
532         srlx    $acc0,32,$sbit
533         and     $acc0,$mask,$acc0
534         add     $acc0,$car1,$car1
535         add     $acc1,$car1,$car1
536         st      $car1,[$tp]                     ! tp[j-1]
537         srlx    $car1,32,$car1
538
539         add     $car0,$car0,$car0
540         add     $sbit,$car0,$car0
541         add     $car0,$car1,$car1
542         add     $car2,$car1,$car1
543         st      $car1,[$tp+4]
544         srlx    $car1,32,$car2
545 \f
546         add     $i,4,$i                         ! i++
547         ld      [%sp+$bias+$frame],$tmp1        ! tp[0]
548         ld      [%sp+$bias+$frame+4],$tpj       ! tp[1]
549         ld      [$ap+$i],$mul0                  ! ap[j]
550         ld      [$np],$car1                     ! np[0]
551         ld      [$np+4],$npj                    ! np[1]
552         mulx    $n0,$tmp1,$mul1
553         and     $mul1,$mask,$mul1
554         add     $i,4,$tmp0
555
556         mulx    $mul0,$mul0,$car0
557         mulx    $car1,$mul1,$car1
558         and     $car0,$mask,$acc0
559         add     $tmp1,$car1,$car1
560         srlx    $car0,32,$car0
561         add     %sp,$bias+$frame,$tp
562         srlx    $car1,32,$car1
563         and     $car0,1,$sbit
564         srlx    $car0,1,$car0
565
566         cmp     $tmp0,$num                      ! i<num-1
567         bl      %icc,.Lsqr_outer
568         mov     4,$j
569 \f
570 .Lsqr_last:
571         mulx    $npj,$mul1,$acc1
572         add     $tpj,$car1,$car1
573         add     $j,4,$j
574         ld      [$tp+8],$tpj
575         cmp     $j,$i
576         add     $acc1,$car1,$car1
577         ld      [$np+$j],$npj
578         st      $car1,[$tp]
579         srlx    $car1,32,$car1
580         bl      %icc,.Lsqr_last
581         add     $tp,4,$tp
582 !.Lsqr_last
583
584         mulx    $npj,$mul1,$acc1
585         add     $tpj,$acc0,$acc0
586         srlx    $acc0,32,$tmp0
587         and     $acc0,$mask,$acc0
588         add     $tmp0,$sbit,$sbit
589         add     $acc0,$car1,$car1
590         add     $acc1,$car1,$car1
591         st      $car1,[$tp]
592         srlx    $car1,32,$car1
593
594         add     $car0,$car0,$car0               ! recover $car0
595         add     $sbit,$car0,$car0
596         add     $car0,$car1,$car1
597         add     $car2,$car1,$car1
598         st      $car1,[$tp+4]
599         srlx    $car1,32,$car2
600
601         ba      .Ltail
602         add     $tp,8,$tp
603 .type   $fname,#function
604 .size   $fname,(.-$fname)
605 .asciz  "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
606 .align  32
607 ___
608 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
609 print $code;
610 close STDOUT;