Do not silently truncate files on perlasm errors
[openssl.git] / crypto / bn / asm / sparcv9-mont.pl
1 #! /usr/bin/env perl
2 # Copyright 2005-2018 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # December 2005
18 #
19 # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
20 # for undertaking the effort are multiple. First of all, UltraSPARC is not
21 # the whole SPARCv9 universe and other VIS-free implementations deserve
22 # optimized code as much. Secondly, newly introduced UltraSPARC T1,
23 # a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
24 # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
25 # several integrated RSA/DSA accelerator circuits accessible through
26 # kernel driver [only(*)], but having decent user-land software
27 # implementation is important too. Finally, there was the desire to
28 # experiment with a dedicated squaring procedure. Yes, this module
29 # implements one, because it was easiest to draft it in SPARCv9
30 # instructions...
31
32 # (*)   Engine accessing the driver in question is on my TODO list.
33 #       For reference, accelerator is estimated to give 6 to 10 times
34 #       improvement on single-threaded RSA sign. It should be noted
35 #       that 6-10x improvement coefficient does not actually mean
36 #       something extraordinary in terms of absolute [single-threaded]
37 #       performance, as SPARCv9 instruction set is by all means least
38 #       suitable for high performance crypto among other 64 bit
39 #       platforms. 6-10x factor simply places T1 in same performance
40 #       domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
41 #       appear impressive at all, but it's the sign operation which is
42 #       far more critical/interesting.
43
44 # You might notice that inner loops are modulo-scheduled:-) This has
45 # essentially negligible impact on UltraSPARC performance, it's
46 # Fujitsu SPARC64 V users who should notice and hopefully appreciate
47 # the advantage... Currently this module surpasses sparcv9a-mont.pl
48 # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
49 # module still has hidden potential [see TODO list there], which is
50 # estimated to be larger than 20%...
51
# Take the output file name from the last command-line argument (if any)
# and redirect STDOUT to it.  A failed open is fatal right here, so the
# real cause ($!) is reported instead of surfacing later as a failed
# write/close without any explanation.
52 $output = pop and (open STDOUT,">$output" or die "can't open $output: $!");
53
54 # Argument registers of int bn_mul_mont(rp, ap, bp, np, n0, num):
55 $rp='%i0';      # BN_ULONG *rp
56 $ap='%i1';      # const BN_ULONG *ap
57 $bp='%i2';      # const BN_ULONG *bp
58 $np='%i3';      # const BN_ULONG *np
59 $n0='%i4';      # const BN_ULONG *n0
60 $num='%i5';     # int num
61
62 $frame='STACK_FRAME';   # symbolic name, emitted as-is into the assembly
63 $bias='STACK_BIAS';     # symbolic name, emitted as-is into the assembly
64
65 $car0='%o0';
66 $car1='%o1';
67 $car2='%o2';    # only one bit is significant
68 $acc0='%o3';
69 $acc1='%o4';
70 $mask='%g1';    # a whole register spent on a 32-bit constant
71 $tmp0='%g4';
72 $tmp1='%g5';
73
74 $i='%l0';
75 $j='%l1';
76 $mul0='%l2';
77 $mul1='%l3';
78 $tp='%l4';
79 $apj='%l5';
80 $npj='%l6';
81 $tpj='%l7';
82
83 $fname='bn_mul_mont_int';   # name of the emitted global symbol
84
# ---------------------------------------------------------------------
# Template for the generated routine ($fname): entry code and the
# general multiplication path.
#
# Entry: returns 0 in %o0 without doing any work when num < 4 ("128
# bits minimum"); otherwise it opens a register window and
#   - turns num into a byte count (num*4; words are 32-bit and are
#     accessed with ld/st),
#   - completes the 0xffffffff mask and replaces the n0 pointer with
#     the word it points to,
#   - moves %sp down to make room for the scratch vector tp[] (the new
#     top of stack is rounded down with "and -1024"),
#   - branches to .Lbn_sqr_mont when ap and bp are the same pointer.
# .L1st            first pass: ap[j]*bp[0] plus np[j]*m, stored to tp[],
#                  where m is the masked product "t[0]"*n0.
# .Louter/.Linner  one pass per remaining bp[i], accumulating into tp[];
#                  the topmost carry is kept in the car2 register.
# .Ltail           .Lsub stores tp[]-np[] to rp[] with borrow
#                  propagation; .Lcopy then overwrites rp[] with tp[]
#                  when the final borrow is set, zeroing tp[] as it
#                  goes.  The routine returns 1.
#
# The here-document text is emitted after Perl variable interpolation,
# so inside it "!" starts an assembler comment, not "#".
# ---------------------------------------------------------------------
85 $code=<<___;
86 #include "sparc_arch.h"
87
88 .section        ".text",#alloc,#execinstr
89
90 .global $fname
91 .align  32
92 $fname:
93         cmp     %o5,4                   ! 128 bits minimum
94         bge,pt  %icc,.Lenter
95         sethi   %hi(0xffffffff),$mask
96         retl
97         clr     %o0
98 .align  32
99 .Lenter:
100         save    %sp,-$frame,%sp
101         sll     $num,2,$num             ! num*=4
102         or      $mask,%lo(0xffffffff),$mask
103         ld      [$n0],$n0
104         cmp     $ap,$bp
105         and     $num,$mask,$num
106         ld      [$bp],$mul0             ! bp[0]
107         nop
108
109         add     %sp,$bias,%o7           ! real top of stack
110         ld      [$ap],$car0             ! ap[0] ! redundant in squaring context
111         sub     %o7,$num,%o7
112         ld      [$ap+4],$apj            ! ap[1]
113         and     %o7,-1024,%o7
114         ld      [$np],$car1             ! np[0]
115         sub     %o7,$bias,%sp           ! alloca
116         ld      [$np+4],$npj            ! np[1]
117         be,pt   SIZE_T_CC,.Lbn_sqr_mont
118         mov     12,$j
119
120         mulx    $car0,$mul0,$car0       ! ap[0]*bp[0]
121         mulx    $apj,$mul0,$tmp0        !prologue! ap[1]*bp[0]
122         and     $car0,$mask,$acc0
123         add     %sp,$bias+$frame,$tp
124         ld      [$ap+8],$apj            !prologue!
125
126         mulx    $n0,$acc0,$mul1         ! "t[0]"*n0
127         and     $mul1,$mask,$mul1
128
129         mulx    $car1,$mul1,$car1       ! np[0]*"t[0]"*n0
130         mulx    $npj,$mul1,$acc1        !prologue! np[1]*"t[0]"*n0
131         srlx    $car0,32,$car0
132         add     $acc0,$car1,$car1
133         ld      [$np+8],$npj            !prologue!
134         srlx    $car1,32,$car1
135         mov     $tmp0,$acc0             !prologue!
136
137 .L1st:
138         mulx    $apj,$mul0,$tmp0
139         mulx    $npj,$mul1,$tmp1
140         add     $acc0,$car0,$car0
141         ld      [$ap+$j],$apj           ! ap[j]
142         and     $car0,$mask,$acc0
143         add     $acc1,$car1,$car1
144         ld      [$np+$j],$npj           ! np[j]
145         srlx    $car0,32,$car0
146         add     $acc0,$car1,$car1
147         add     $j,4,$j                 ! j++
148         mov     $tmp0,$acc0
149         st      $car1,[$tp]
150         cmp     $j,$num
151         mov     $tmp1,$acc1
152         srlx    $car1,32,$car1
153         bl      %icc,.L1st
154         add     $tp,4,$tp               ! tp++
155 !.L1st
156
157         mulx    $apj,$mul0,$tmp0        !epilogue!
158         mulx    $npj,$mul1,$tmp1
159         add     $acc0,$car0,$car0
160         and     $car0,$mask,$acc0
161         add     $acc1,$car1,$car1
162         srlx    $car0,32,$car0
163         add     $acc0,$car1,$car1
164         st      $car1,[$tp]
165         srlx    $car1,32,$car1
166
167         add     $tmp0,$car0,$car0
168         and     $car0,$mask,$acc0
169         add     $tmp1,$car1,$car1
170         srlx    $car0,32,$car0
171         add     $acc0,$car1,$car1
172         st      $car1,[$tp+4]
173         srlx    $car1,32,$car1
174
175         add     $car0,$car1,$car1
176         st      $car1,[$tp+8]
177         srlx    $car1,32,$car2
178 \f
179         mov     4,$i                    ! i++
180         ld      [$bp+4],$mul0           ! bp[1]
181 .Louter:
182         add     %sp,$bias+$frame,$tp
183         ld      [$ap],$car0             ! ap[0]
184         ld      [$ap+4],$apj            ! ap[1]
185         ld      [$np],$car1             ! np[0]
186         ld      [$np+4],$npj            ! np[1]
187         ld      [$tp],$tmp1             ! tp[0]
188         ld      [$tp+4],$tpj            ! tp[1]
189         mov     12,$j
190
191         mulx    $car0,$mul0,$car0
192         mulx    $apj,$mul0,$tmp0        !prologue!
193         add     $tmp1,$car0,$car0
194         ld      [$ap+8],$apj            !prologue!
195         and     $car0,$mask,$acc0
196
197         mulx    $n0,$acc0,$mul1
198         and     $mul1,$mask,$mul1
199
200         mulx    $car1,$mul1,$car1
201         mulx    $npj,$mul1,$acc1        !prologue!
202         srlx    $car0,32,$car0
203         add     $acc0,$car1,$car1
204         ld      [$np+8],$npj            !prologue!
205         srlx    $car1,32,$car1
206         mov     $tmp0,$acc0             !prologue!
207
208 .Linner:
209         mulx    $apj,$mul0,$tmp0
210         mulx    $npj,$mul1,$tmp1
211         add     $tpj,$car0,$car0
212         ld      [$ap+$j],$apj           ! ap[j]
213         add     $acc0,$car0,$car0
214         add     $acc1,$car1,$car1
215         ld      [$np+$j],$npj           ! np[j]
216         and     $car0,$mask,$acc0
217         ld      [$tp+8],$tpj            ! tp[j]
218         srlx    $car0,32,$car0
219         add     $acc0,$car1,$car1
220         add     $j,4,$j                 ! j++
221         mov     $tmp0,$acc0
222         st      $car1,[$tp]             ! tp[j-1]
223         srlx    $car1,32,$car1
224         mov     $tmp1,$acc1
225         cmp     $j,$num
226         bl      %icc,.Linner
227         add     $tp,4,$tp               ! tp++
228 !.Linner
229
230         mulx    $apj,$mul0,$tmp0        !epilogue!
231         mulx    $npj,$mul1,$tmp1
232         add     $tpj,$car0,$car0
233         add     $acc0,$car0,$car0
234         ld      [$tp+8],$tpj            ! tp[j]
235         and     $car0,$mask,$acc0
236         add     $acc1,$car1,$car1
237         srlx    $car0,32,$car0
238         add     $acc0,$car1,$car1
239         st      $car1,[$tp]             ! tp[j-1]
240         srlx    $car1,32,$car1
241
242         add     $tpj,$car0,$car0
243         add     $tmp0,$car0,$car0
244         and     $car0,$mask,$acc0
245         add     $tmp1,$car1,$car1
246         add     $acc0,$car1,$car1
247         st      $car1,[$tp+4]           ! tp[j-1]
248         srlx    $car0,32,$car0
249         add     $i,4,$i                 ! i++
250         srlx    $car1,32,$car1
251
252         add     $car0,$car1,$car1
253         cmp     $i,$num
254         add     $car2,$car1,$car1
255         st      $car1,[$tp+8]
256
257         srlx    $car1,32,$car2
258         bl,a    %icc,.Louter
259         ld      [$bp+$i],$mul0          ! bp[i]
260 !.Louter
261
262         add     $tp,12,$tp
263 \f
264 .Ltail:
265         add     $np,$num,$np
266         add     $rp,$num,$rp
267         sub     %g0,$num,%o7            ! k=-num
268         ba      .Lsub
269         subcc   %g0,%g0,%g0             ! clear %icc.c
270 .align  16
271 .Lsub:
272         ld      [$tp+%o7],%o0
273         ld      [$np+%o7],%o1
274         subccc  %o0,%o1,%o1             ! tp[j]-np[j]
275         add     $rp,%o7,$i
276         add     %o7,4,%o7
277         brnz    %o7,.Lsub
278         st      %o1,[$i]
279         subccc  $car2,0,$car2           ! handle upmost overflow bit
280         sub     %g0,$num,%o7
281
282 .Lcopy:
283         ld      [$tp+%o7],%o1           ! conditional copy
284         ld      [$rp+%o7],%o0
285         st      %g0,[$tp+%o7]           ! zap tp
286         movcs   %icc,%o1,%o0
287         st      %o0,[$rp+%o7]
288         add     %o7,4,%o7
289         brnz    %o7,.Lcopy
290         nop
291         mov     1,%i0
292         ret
293         restore
294 ___
295 \f
296 ########
297 ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
298 ######## code without the following dedicated squaring procedure.
299 ########
300 $sbit="%o5";   # carries bits between columns while products are doubled
301
# ---------------------------------------------------------------------
# Template for the squaring path, entered through the branch to
# .Lbn_sqr_mont in the entry code when ap and bp are the same pointer.
#
# .Lsqr_1st      pass for ap[0]: ap[0]*ap[0], then the cross products
#                ap[j]*ap[0] doubled, plus np[j]*m.
# .Lsqr_2nd      pass for ap[1].
# .Lsqr_outer    passes for ap[2] and up.  .Lsqr_inner1 covers the
#                columns with j below i, where only tp[j] and np[j]*m
#                are added; .Lsqr_inner2 (skipped via .Lsqr_no_inner2
#                when no columns remain) adds the doubled products.
# .Lsqr_last     final pass, after which control is transferred to
#                .Ltail for the closing subtraction and copy.
#
# Doubling is written as acc+acc; sbit feeds what is shifted out of one
# 32-bit column into the next.
# The template closes with the symbol type/size directives and an
# identification string.
# ---------------------------------------------------------------------
302 $code.=<<___;
303 .align  32
304 .Lbn_sqr_mont:
305         mulx    $mul0,$mul0,$car0               ! ap[0]*ap[0]
306         mulx    $apj,$mul0,$tmp0                !prologue!
307         and     $car0,$mask,$acc0
308         add     %sp,$bias+$frame,$tp
309         ld      [$ap+8],$apj                    !prologue!
310
311         mulx    $n0,$acc0,$mul1                 ! "t[0]"*n0
312         srlx    $car0,32,$car0
313         and     $mul1,$mask,$mul1
314
315         mulx    $car1,$mul1,$car1               ! np[0]*"t[0]"*n0
316         mulx    $npj,$mul1,$acc1                !prologue!
317         and     $car0,1,$sbit
318         ld      [$np+8],$npj                    !prologue!
319         srlx    $car0,1,$car0
320         add     $acc0,$car1,$car1
321         srlx    $car1,32,$car1
322         mov     $tmp0,$acc0                     !prologue!
323
324 .Lsqr_1st:
325         mulx    $apj,$mul0,$tmp0
326         mulx    $npj,$mul1,$tmp1
327         add     $acc0,$car0,$car0               ! ap[j]*a0+c0
328         add     $acc1,$car1,$car1
329         ld      [$ap+$j],$apj                   ! ap[j]
330         and     $car0,$mask,$acc0
331         ld      [$np+$j],$npj                   ! np[j]
332         srlx    $car0,32,$car0
333         add     $acc0,$acc0,$acc0
334         or      $sbit,$acc0,$acc0
335         mov     $tmp1,$acc1
336         srlx    $acc0,32,$sbit
337         add     $j,4,$j                         ! j++
338         and     $acc0,$mask,$acc0
339         cmp     $j,$num
340         add     $acc0,$car1,$car1
341         st      $car1,[$tp]
342         mov     $tmp0,$acc0
343         srlx    $car1,32,$car1
344         bl      %icc,.Lsqr_1st
345         add     $tp,4,$tp                       ! tp++
346 !.Lsqr_1st
347
348         mulx    $apj,$mul0,$tmp0                ! epilogue
349         mulx    $npj,$mul1,$tmp1
350         add     $acc0,$car0,$car0               ! ap[j]*a0+c0
351         add     $acc1,$car1,$car1
352         and     $car0,$mask,$acc0
353         srlx    $car0,32,$car0
354         add     $acc0,$acc0,$acc0
355         or      $sbit,$acc0,$acc0
356         srlx    $acc0,32,$sbit
357         and     $acc0,$mask,$acc0
358         add     $acc0,$car1,$car1
359         st      $car1,[$tp]
360         srlx    $car1,32,$car1
361
362         add     $tmp0,$car0,$car0               ! ap[j]*a0+c0
363         add     $tmp1,$car1,$car1
364         and     $car0,$mask,$acc0
365         srlx    $car0,32,$car0
366         add     $acc0,$acc0,$acc0
367         or      $sbit,$acc0,$acc0
368         srlx    $acc0,32,$sbit
369         and     $acc0,$mask,$acc0
370         add     $acc0,$car1,$car1
371         st      $car1,[$tp+4]
372         srlx    $car1,32,$car1
373
374         add     $car0,$car0,$car0
375         or      $sbit,$car0,$car0
376         add     $car0,$car1,$car1
377         st      $car1,[$tp+8]
378         srlx    $car1,32,$car2
379 \f
380         ld      [%sp+$bias+$frame],$tmp0        ! tp[0]
381         ld      [%sp+$bias+$frame+4],$tmp1      ! tp[1]
382         ld      [%sp+$bias+$frame+8],$tpj       ! tp[2]
383         ld      [$ap+4],$mul0                   ! ap[1]
384         ld      [$ap+8],$apj                    ! ap[2]
385         ld      [$np],$car1                     ! np[0]
386         ld      [$np+4],$npj                    ! np[1]
387         mulx    $n0,$tmp0,$mul1
388
389         mulx    $mul0,$mul0,$car0
390         and     $mul1,$mask,$mul1
391
392         mulx    $car1,$mul1,$car1
393         mulx    $npj,$mul1,$acc1
394         add     $tmp0,$car1,$car1
395         and     $car0,$mask,$acc0
396         ld      [$np+8],$npj                    ! np[2]
397         srlx    $car1,32,$car1
398         add     $tmp1,$car1,$car1
399         srlx    $car0,32,$car0
400         add     $acc0,$car1,$car1
401         and     $car0,1,$sbit
402         add     $acc1,$car1,$car1
403         srlx    $car0,1,$car0
404         mov     12,$j
405         st      $car1,[%sp+$bias+$frame]        ! tp[0]=
406         srlx    $car1,32,$car1
407         add     %sp,$bias+$frame+4,$tp
408
409 .Lsqr_2nd:
410         mulx    $apj,$mul0,$acc0
411         mulx    $npj,$mul1,$acc1
412         add     $acc0,$car0,$car0
413         add     $tpj,$sbit,$sbit
414         ld      [$ap+$j],$apj                   ! ap[j]
415         and     $car0,$mask,$acc0
416         ld      [$np+$j],$npj                   ! np[j]
417         srlx    $car0,32,$car0
418         add     $acc1,$car1,$car1
419         ld      [$tp+8],$tpj                    ! tp[j]
420         add     $acc0,$acc0,$acc0
421         add     $j,4,$j                         ! j++
422         add     $sbit,$acc0,$acc0
423         srlx    $acc0,32,$sbit
424         and     $acc0,$mask,$acc0
425         cmp     $j,$num
426         add     $acc0,$car1,$car1
427         st      $car1,[$tp]                     ! tp[j-1]
428         srlx    $car1,32,$car1
429         bl      %icc,.Lsqr_2nd
430         add     $tp,4,$tp                       ! tp++
431 !.Lsqr_2nd
432
433         mulx    $apj,$mul0,$acc0
434         mulx    $npj,$mul1,$acc1
435         add     $acc0,$car0,$car0
436         add     $tpj,$sbit,$sbit
437         and     $car0,$mask,$acc0
438         srlx    $car0,32,$car0
439         add     $acc1,$car1,$car1
440         add     $acc0,$acc0,$acc0
441         add     $sbit,$acc0,$acc0
442         srlx    $acc0,32,$sbit
443         and     $acc0,$mask,$acc0
444         add     $acc0,$car1,$car1
445         st      $car1,[$tp]                     ! tp[j-1]
446         srlx    $car1,32,$car1
447
448         add     $car0,$car0,$car0
449         add     $sbit,$car0,$car0
450         add     $car0,$car1,$car1
451         add     $car2,$car1,$car1
452         st      $car1,[$tp+4]
453         srlx    $car1,32,$car2
454 \f
455         ld      [%sp+$bias+$frame],$tmp1        ! tp[0]
456         ld      [%sp+$bias+$frame+4],$tpj       ! tp[1]
457         ld      [$ap+8],$mul0                   ! ap[2]
458         ld      [$np],$car1                     ! np[0]
459         ld      [$np+4],$npj                    ! np[1]
460         mulx    $n0,$tmp1,$mul1
461         and     $mul1,$mask,$mul1
462         mov     8,$i
463
464         mulx    $mul0,$mul0,$car0
465         mulx    $car1,$mul1,$car1
466         and     $car0,$mask,$acc0
467         add     $tmp1,$car1,$car1
468         srlx    $car0,32,$car0
469         add     %sp,$bias+$frame,$tp
470         srlx    $car1,32,$car1
471         and     $car0,1,$sbit
472         srlx    $car0,1,$car0
473         mov     4,$j
474
475 .Lsqr_outer:
476 .Lsqr_inner1:
477         mulx    $npj,$mul1,$acc1
478         add     $tpj,$car1,$car1
479         add     $j,4,$j
480         ld      [$tp+8],$tpj
481         cmp     $j,$i
482         add     $acc1,$car1,$car1
483         ld      [$np+$j],$npj
484         st      $car1,[$tp]
485         srlx    $car1,32,$car1
486         bl      %icc,.Lsqr_inner1
487         add     $tp,4,$tp
488 !.Lsqr_inner1
489
490         add     $j,4,$j
491         ld      [$ap+$j],$apj                   ! ap[j]
492         mulx    $npj,$mul1,$acc1
493         add     $tpj,$car1,$car1
494         ld      [$np+$j],$npj                   ! np[j]
495         srlx    $car1,32,$tmp0
496         and     $car1,$mask,$car1
497         add     $tmp0,$sbit,$sbit
498         add     $acc0,$car1,$car1
499         ld      [$tp+8],$tpj                    ! tp[j]
500         add     $acc1,$car1,$car1
501         st      $car1,[$tp]
502         srlx    $car1,32,$car1
503
504         add     $j,4,$j
505         cmp     $j,$num
506         be,pn   %icc,.Lsqr_no_inner2
507         add     $tp,4,$tp
508
509 .Lsqr_inner2:
510         mulx    $apj,$mul0,$acc0
511         mulx    $npj,$mul1,$acc1
512         add     $tpj,$sbit,$sbit
513         add     $acc0,$car0,$car0
514         ld      [$ap+$j],$apj                   ! ap[j]
515         and     $car0,$mask,$acc0
516         ld      [$np+$j],$npj                   ! np[j]
517         srlx    $car0,32,$car0
518         add     $acc0,$acc0,$acc0
519         ld      [$tp+8],$tpj                    ! tp[j]
520         add     $sbit,$acc0,$acc0
521         add     $j,4,$j                         ! j++
522         srlx    $acc0,32,$sbit
523         and     $acc0,$mask,$acc0
524         cmp     $j,$num
525         add     $acc0,$car1,$car1
526         add     $acc1,$car1,$car1
527         st      $car1,[$tp]                     ! tp[j-1]
528         srlx    $car1,32,$car1
529         bl      %icc,.Lsqr_inner2
530         add     $tp,4,$tp                       ! tp++
531
532 .Lsqr_no_inner2:
533         mulx    $apj,$mul0,$acc0
534         mulx    $npj,$mul1,$acc1
535         add     $tpj,$sbit,$sbit
536         add     $acc0,$car0,$car0
537         and     $car0,$mask,$acc0
538         srlx    $car0,32,$car0
539         add     $acc0,$acc0,$acc0
540         add     $sbit,$acc0,$acc0
541         srlx    $acc0,32,$sbit
542         and     $acc0,$mask,$acc0
543         add     $acc0,$car1,$car1
544         add     $acc1,$car1,$car1
545         st      $car1,[$tp]                     ! tp[j-1]
546         srlx    $car1,32,$car1
547
548         add     $car0,$car0,$car0
549         add     $sbit,$car0,$car0
550         add     $car0,$car1,$car1
551         add     $car2,$car1,$car1
552         st      $car1,[$tp+4]
553         srlx    $car1,32,$car2
554 \f
555         add     $i,4,$i                         ! i++
556         ld      [%sp+$bias+$frame],$tmp1        ! tp[0]
557         ld      [%sp+$bias+$frame+4],$tpj       ! tp[1]
558         ld      [$ap+$i],$mul0                  ! ap[j]
559         ld      [$np],$car1                     ! np[0]
560         ld      [$np+4],$npj                    ! np[1]
561         mulx    $n0,$tmp1,$mul1
562         and     $mul1,$mask,$mul1
563         add     $i,4,$tmp0
564
565         mulx    $mul0,$mul0,$car0
566         mulx    $car1,$mul1,$car1
567         and     $car0,$mask,$acc0
568         add     $tmp1,$car1,$car1
569         srlx    $car0,32,$car0
570         add     %sp,$bias+$frame,$tp
571         srlx    $car1,32,$car1
572         and     $car0,1,$sbit
573         srlx    $car0,1,$car0
574
575         cmp     $tmp0,$num                      ! i<num-1
576         bl      %icc,.Lsqr_outer
577         mov     4,$j
578 \f
579 .Lsqr_last:
580         mulx    $npj,$mul1,$acc1
581         add     $tpj,$car1,$car1
582         add     $j,4,$j
583         ld      [$tp+8],$tpj
584         cmp     $j,$i
585         add     $acc1,$car1,$car1
586         ld      [$np+$j],$npj
587         st      $car1,[$tp]
588         srlx    $car1,32,$car1
589         bl      %icc,.Lsqr_last
590         add     $tp,4,$tp
591 !.Lsqr_last
592
593         mulx    $npj,$mul1,$acc1
594         add     $tpj,$acc0,$acc0
595         srlx    $acc0,32,$tmp0
596         and     $acc0,$mask,$acc0
597         add     $tmp0,$sbit,$sbit
598         add     $acc0,$car1,$car1
599         add     $acc1,$car1,$car1
600         st      $car1,[$tp]
601         srlx    $car1,32,$car1
602
603         add     $car0,$car0,$car0               ! recover $car0
604         add     $sbit,$car0,$car0
605         add     $car0,$car1,$car1
606         add     $car2,$car1,$car1
607         st      $car1,[$tp+4]
608         srlx    $car1,32,$car2
609
610         ba      .Ltail
611         add     $tp,8,$tp
612 .type   $fname,#function
613 .size   $fname,(.-$fname)
614 .asciz  "Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
615 .align  32
616 ___
# Replace every `...` span in the generated text with the result of
# evaluating its contents as Perl, then write the text to STDOUT.
# NOTE(review): the templates visible in this file contain no backticks,
# so the substitution is presumably kept for uniformity with other
# perlasm scripts -- confirm before removing.
617 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
618 print $code;
# Buffered write errors are reported by close; include $! so the reason
# for the failure (e.g. disk full) ends up in the message.
619 close STDOUT or die "error closing STDOUT: $!";