bn/asm/*-mont.pl: harmonize with BN_from_montgomery_word.
[openssl.git] / crypto / bn / asm / sparcv9-mont.pl
1 #! /usr/bin/env perl
2 # Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # December 2005
18 #
19 # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. There are
20 # several reasons for undertaking this effort. First of all, UltraSPARC
21 # is not the whole SPARCv9 universe, and other VIS-free implementations
22 # deserve optimized code just as much. Secondly, the newly introduced
23 # UltraSPARC T1, a.k.a. Niagara, has a shared FPU, and concurrent
24 # FPU-intensive paths, such as sparcv9a-mont, will simply sink it. Yes,
25 # T1 is equipped with several integrated RSA/DSA accelerator circuits
26 # accessible through a kernel driver [only(*)], but having a decent
27 # user-land software implementation is important too. Finally, there
28 # was the desire to experiment with a dedicated squaring procedure.
29 # Yes, this module implements one, because it was easiest to draft it
30 # in SPARCv9 instructions...
31
32 # (*)   An engine accessing the driver in question is on my TODO list.
33 #       For reference, the accelerator is estimated to give a 6 to 10
34 #       times improvement on single-threaded RSA sign. Note that a
35 #       6-10x improvement coefficient does not actually mean anything
36 #       extraordinary in terms of absolute [single-threaded]
37 #       performance, as the SPARCv9 instruction set is by all means
38 #       the least suitable for high-performance crypto among 64-bit
39 #       platforms. The 6-10x factor simply places T1 in the same
40 #       performance domain as, say, AMD64 and IA-64. The improvement
41 #       of RSA verify does not appear impressive at all, but it is the
42 #       sign operation which is far more critical/interesting.
43
44 # You might notice that the inner loops are modulo-scheduled:-) This
45 # has essentially negligible impact on UltraSPARC performance; it is
46 # Fujitsu SPARC64 V users who should notice and hopefully appreciate
47 # the advantage... Currently this module surpasses sparcv9a-mont.pl
48 # by ~20% on UltraSPARC-III and later cores, but recall that the
49 # sparcv9a module still has hidden potential [see TODO list there],
50 # which is estimated to be larger than 20%...
51
52 $output = pop;          # last command-line argument names the output file
53 open STDOUT,">$output" or die "can't open $output: $!" if defined $output;
54
55 # Register map for: int bn_mul_mont(
56 ($rp,                   # BN_ULONG *rp,
57  $ap,                   # const BN_ULONG *ap,
58  $bp,                   # const BN_ULONG *bp,
59  $np,                   # const BN_ULONG *np,
60  $n0,                   # const BN_ULONG *n0,
61  $num)                  # int num);
62         =map("%i$_",(0..5));
63
64 ($frame,$bias)=("STACK_FRAME","STACK_BIAS");   # emitted as symbolic names
65
66 ($car0,$car1,           # running carries
67  $car2,                 # 1 bit
68  $acc0,$acc1)           # accumulators
69         =map("%o$_",(0..4));
70
71 $mask="%g1";            # 32 bits, what a waste...
72 ($tmp0,$tmp1)=("%g4","%g5");
73
74 ($i,$j,                 # loop offsets
75  $mul0,$mul1,           # multipliers
76  $tp,                   # pointer into the temporary vector
77  $apj,$npj,$tpj)        # words loaded ahead of use
78         =map("%l$_",(0..7));
79
80 # Symbol under which the generated routine is emitted. The assembly
81 # text refers to it in its .global, label, .type and .size lines, so
82 # it is defined once here.
83
84 $fname="bn_mul_mont_int";
85
86 $code=<<___;    # entry checks, stack setup, multiplication passes and final reduction
87 #include "sparc_arch.h"
88
89 .section        ".text",#alloc,#execinstr
90
91 .global $fname
92 .align  32
93 $fname:
94         cmp     %o5,4                   ! 128 bits minimum (num>=4 words)
95         bge,pt  %icc,.Lenter
96         sethi   %hi(0xffffffff),$mask   ! delay slot, executed on both paths
97         retl                            ! too short: return 0
98         clr     %o0
99 .align  32
100 .Lenter:
101         save    %sp,-$frame,%sp         ! new register window, arguments now in %i0-%i5
102         sll     $num,2,$num             ! num*=4
103         or      $mask,%lo(0xffffffff),$mask     ! mask=0xffffffff
104         ld      [$n0],$n0               ! fetch the n0 word itself
105         cmp     $ap,$bp                 ! ap==bp selects squaring, tested below
106         and     $num,$mask,$num         ! clear upper 32 bits of num
107         ld      [$bp],$mul0             ! bp[0]
108         nop
109
110         add     %sp,$bias,%o7           ! real top of stack
111         ld      [$ap],$car0             ! ap[0] ! redundant in squaring context
112         sub     %o7,$num,%o7
113         ld      [$ap+4],$apj            ! ap[1]
114         and     %o7,-1024,%o7           ! round down to a 1024-byte boundary
115         ld      [$np],$car1             ! np[0]
116         sub     %o7,$bias,%sp           ! alloca
117         ld      [$np+4],$npj            ! np[1]
118         be,pt   SIZE_T_CC,.Lbn_sqr_mont ! flags still from the cmp above
119         mov     12,$j                   ! j=3 as byte offset (delay slot)
120
121         mulx    $car0,$mul0,$car0       ! ap[0]*bp[0]
122         mulx    $apj,$mul0,$tmp0        !prologue! ap[1]*bp[0]
123         and     $car0,$mask,$acc0
124         add     %sp,$bias+$frame,$tp
125         ld      [$ap+8],$apj            !prologue!
126
127         mulx    $n0,$acc0,$mul1         ! "t[0]"*n0
128         and     $mul1,$mask,$mul1
129
130         mulx    $car1,$mul1,$car1       ! np[0]*"t[0]"*n0
131         mulx    $npj,$mul1,$acc1        !prologue! np[1]*"t[0]"*n0
132         srlx    $car0,32,$car0
133         add     $acc0,$car1,$car1
134         ld      [$np+8],$npj            !prologue!
135         srlx    $car1,32,$car1
136         mov     $tmp0,$acc0             !prologue!
137
138 .L1st:
139         mulx    $apj,$mul0,$tmp0
140         mulx    $npj,$mul1,$tmp1
141         add     $acc0,$car0,$car0
142         ld      [$ap+$j],$apj           ! ap[j]
143         and     $car0,$mask,$acc0
144         add     $acc1,$car1,$car1
145         ld      [$np+$j],$npj           ! np[j]
146         srlx    $car0,32,$car0
147         add     $acc0,$car1,$car1
148         add     $j,4,$j                 ! j++
149         mov     $tmp0,$acc0
150         st      $car1,[$tp]
151         cmp     $j,$num
152         mov     $tmp1,$acc1
153         srlx    $car1,32,$car1
154         bl      %icc,.L1st
155         add     $tp,4,$tp               ! tp++
156 !.L1st
157
158         mulx    $apj,$mul0,$tmp0        !epilogue!
159         mulx    $npj,$mul1,$tmp1
160         add     $acc0,$car0,$car0
161         and     $car0,$mask,$acc0
162         add     $acc1,$car1,$car1
163         srlx    $car0,32,$car0
164         add     $acc0,$car1,$car1
165         st      $car1,[$tp]
166         srlx    $car1,32,$car1
167
168         add     $tmp0,$car0,$car0
169         and     $car0,$mask,$acc0
170         add     $tmp1,$car1,$car1
171         srlx    $car0,32,$car0
172         add     $acc0,$car1,$car1
173         st      $car1,[$tp+4]
174         srlx    $car1,32,$car1
175
176         add     $car0,$car1,$car1
177         st      $car1,[$tp+8]
178         srlx    $car1,32,$car2          ! overflow bit above the top stored word
179 \f
180         mov     4,$i                    ! i=1 (byte offset 4)
181         ld      [$bp+4],$mul0           ! bp[1]
182 .Louter:
183         add     %sp,$bias+$frame,$tp    ! rewind tp to the start of the vector
184         ld      [$ap],$car0             ! ap[0]
185         ld      [$ap+4],$apj            ! ap[1]
186         ld      [$np],$car1             ! np[0]
187         ld      [$np+4],$npj            ! np[1]
188         ld      [$tp],$tmp1             ! tp[0]
189         ld      [$tp+4],$tpj            ! tp[1]
190         mov     12,$j
191
192         mulx    $car0,$mul0,$car0
193         mulx    $apj,$mul0,$tmp0        !prologue!
194         add     $tmp1,$car0,$car0
195         ld      [$ap+8],$apj            !prologue!
196         and     $car0,$mask,$acc0
197
198         mulx    $n0,$acc0,$mul1
199         and     $mul1,$mask,$mul1
200
201         mulx    $car1,$mul1,$car1
202         mulx    $npj,$mul1,$acc1        !prologue!
203         srlx    $car0,32,$car0
204         add     $acc0,$car1,$car1
205         ld      [$np+8],$npj            !prologue!
206         srlx    $car1,32,$car1
207         mov     $tmp0,$acc0             !prologue!
208
209 .Linner:
210         mulx    $apj,$mul0,$tmp0
211         mulx    $npj,$mul1,$tmp1
212         add     $tpj,$car0,$car0
213         ld      [$ap+$j],$apj           ! ap[j]
214         add     $acc0,$car0,$car0
215         add     $acc1,$car1,$car1
216         ld      [$np+$j],$npj           ! np[j]
217         and     $car0,$mask,$acc0
218         ld      [$tp+8],$tpj            ! tp[j]
219         srlx    $car0,32,$car0
220         add     $acc0,$car1,$car1
221         add     $j,4,$j                 ! j++
222         mov     $tmp0,$acc0
223         st      $car1,[$tp]             ! tp[j-1]
224         srlx    $car1,32,$car1
225         mov     $tmp1,$acc1
226         cmp     $j,$num
227         bl      %icc,.Linner
228         add     $tp,4,$tp               ! tp++
229 !.Linner
230
231         mulx    $apj,$mul0,$tmp0        !epilogue!
232         mulx    $npj,$mul1,$tmp1
233         add     $tpj,$car0,$car0
234         add     $acc0,$car0,$car0
235         ld      [$tp+8],$tpj            ! tp[j]
236         and     $car0,$mask,$acc0
237         add     $acc1,$car1,$car1
238         srlx    $car0,32,$car0
239         add     $acc0,$car1,$car1
240         st      $car1,[$tp]             ! tp[j-1]
241         srlx    $car1,32,$car1
242
243         add     $tpj,$car0,$car0
244         add     $tmp0,$car0,$car0
245         and     $car0,$mask,$acc0
246         add     $tmp1,$car1,$car1
247         add     $acc0,$car1,$car1
248         st      $car1,[$tp+4]           ! tp[j-1]
249         srlx    $car0,32,$car0
250         add     $i,4,$i                 ! i++
251         srlx    $car1,32,$car1
252
253         add     $car0,$car1,$car1
254         cmp     $i,$num
255         add     $car2,$car1,$car1       ! fold in the overflow bit of the previous pass
256         st      $car1,[$tp+8]
257
258         srlx    $car1,32,$car2
259         bl,a    %icc,.Louter            ! annulled: delay slot runs only if taken
260         ld      [$bp+$i],$mul0          ! bp[i]
261 !.Louter
262
263         add     $tp,12,$tp              ! step past the three words stored last
264 \f
265 .Ltail:
266         add     $np,$num,$np            ! np and rp are indexed with negative k below
267         add     $rp,$num,$rp
268         sub     %g0,$num,%o7            ! k=-num
269         ba      .Lsub
270         subcc   %g0,%g0,%g0             ! clear %icc.c
271 .align  16
272 .Lsub:
273         ld      [$tp+%o7],%o0
274         ld      [$np+%o7],%o1
275         subccc  %o0,%o1,%o1             ! tp[j]-np[j]
276         add     $rp,%o7,$i
277         add     %o7,4,%o7
278         brnz    %o7,.Lsub               ! until k reaches 0
279         st      %o1,[$i]                ! rp[j]=tp[j]-np[j] (delay slot)
280         subccc  $car2,0,$car2           ! handle upmost overflow bit
281         sub     %g0,$num,%o7            ! k=-num again
282
283 .Lcopy:
284         ld      [$tp+%o7],%o1           ! conditional copy
285         ld      [$rp+%o7],%o0
286         st      %g0,[$tp+%o7]           ! zap tp
287         movcs   %icc,%o1,%o0            ! borrow set: take tp[j] instead of the difference
288         st      %o0,[$rp+%o7]
289         add     %o7,4,%o7
290         brnz    %o7,.Lcopy
291         nop
292         mov     1,%i0                   ! return 1
293         ret
294         restore
295 ___
296 \f
297 ########
298 ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
299 ######## code without the following dedicated squaring procedure.
300 ########
301 $sbit="%o5";    # spill-over between words when products are doubled in the squaring path
302
303 $code.=<<___;   # dedicated squaring path, branched to from the entry code when ap==bp
304 .align  32
305 .Lbn_sqr_mont:
306         mulx    $mul0,$mul0,$car0               ! ap[0]*ap[0]
307         mulx    $apj,$mul0,$tmp0                !prologue!
308         and     $car0,$mask,$acc0
309         add     %sp,$bias+$frame,$tp
310         ld      [$ap+8],$apj                    !prologue!
311
312         mulx    $n0,$acc0,$mul1                 ! "t[0]"*n0
313         srlx    $car0,32,$car0
314         and     $mul1,$mask,$mul1
315
316         mulx    $car1,$mul1,$car1               ! np[0]*"t[0]"*n0
317         mulx    $npj,$mul1,$acc1                !prologue!
318         and     $car0,1,$sbit                   ! keep the bit shifted out below
319         ld      [$np+8],$npj                    !prologue!
320         srlx    $car0,1,$car0                   ! halve; words are doubled again on output
321         add     $acc0,$car1,$car1
322         srlx    $car1,32,$car1
323         mov     $tmp0,$acc0                     !prologue!
324
325 .Lsqr_1st:
326         mulx    $apj,$mul0,$tmp0
327         mulx    $npj,$mul1,$tmp1
328         add     $acc0,$car0,$car0               ! ap[j]*a0+c0
329         add     $acc1,$car1,$car1
330         ld      [$ap+$j],$apj                   ! ap[j]
331         and     $car0,$mask,$acc0
332         ld      [$np+$j],$npj                   ! np[j]
333         srlx    $car0,32,$car0
334         add     $acc0,$acc0,$acc0               ! double the word
335         or      $sbit,$acc0,$acc0               ! bring in the bit from the previous word
336         mov     $tmp1,$acc1
337         srlx    $acc0,32,$sbit                  ! bit for the next word
338         add     $j,4,$j                         ! j++
339         and     $acc0,$mask,$acc0
340         cmp     $j,$num
341         add     $acc0,$car1,$car1
342         st      $car1,[$tp]
343         mov     $tmp0,$acc0
344         srlx    $car1,32,$car1
345         bl      %icc,.Lsqr_1st
346         add     $tp,4,$tp                       ! tp++
347 !.Lsqr_1st
348
349         mulx    $apj,$mul0,$tmp0                ! epilogue
350         mulx    $npj,$mul1,$tmp1
351         add     $acc0,$car0,$car0               ! ap[j]*a0+c0
352         add     $acc1,$car1,$car1
353         and     $car0,$mask,$acc0
354         srlx    $car0,32,$car0
355         add     $acc0,$acc0,$acc0
356         or      $sbit,$acc0,$acc0
357         srlx    $acc0,32,$sbit
358         and     $acc0,$mask,$acc0
359         add     $acc0,$car1,$car1
360         st      $car1,[$tp]
361         srlx    $car1,32,$car1
362
363         add     $tmp0,$car0,$car0               ! ap[j]*a0+c0
364         add     $tmp1,$car1,$car1
365         and     $car0,$mask,$acc0
366         srlx    $car0,32,$car0
367         add     $acc0,$acc0,$acc0
368         or      $sbit,$acc0,$acc0
369         srlx    $acc0,32,$sbit
370         and     $acc0,$mask,$acc0
371         add     $acc0,$car1,$car1
372         st      $car1,[$tp+4]
373         srlx    $car1,32,$car1
374
375         add     $car0,$car0,$car0               ! double the remaining carry
376         or      $sbit,$car0,$car0
377         add     $car0,$car1,$car1
378         st      $car1,[$tp+8]
379         srlx    $car1,32,$car2                  ! overflow bit above the top stored word
380 \f
381         ld      [%sp+$bias+$frame],$tmp0        ! tp[0]
382         ld      [%sp+$bias+$frame+4],$tmp1      ! tp[1]
383         ld      [%sp+$bias+$frame+8],$tpj       ! tp[2]
384         ld      [$ap+4],$mul0                   ! ap[1]
385         ld      [$ap+8],$apj                    ! ap[2]
386         ld      [$np],$car1                     ! np[0]
387         ld      [$np+4],$npj                    ! np[1]
388         mulx    $n0,$tmp0,$mul1
389
390         mulx    $mul0,$mul0,$car0               ! ap[1]*ap[1]
391         and     $mul1,$mask,$mul1
392
393         mulx    $car1,$mul1,$car1
394         mulx    $npj,$mul1,$acc1
395         add     $tmp0,$car1,$car1
396         and     $car0,$mask,$acc0
397         ld      [$np+8],$npj                    ! np[2]
398         srlx    $car1,32,$car1
399         add     $tmp1,$car1,$car1
400         srlx    $car0,32,$car0
401         add     $acc0,$car1,$car1
402         and     $car0,1,$sbit
403         add     $acc1,$car1,$car1
404         srlx    $car0,1,$car0
405         mov     12,$j
406         st      $car1,[%sp+$bias+$frame]        ! tp[0]=
407         srlx    $car1,32,$car1
408         add     %sp,$bias+$frame+4,$tp
409
410 .Lsqr_2nd:
411         mulx    $apj,$mul0,$acc0
412         mulx    $npj,$mul1,$acc1
413         add     $acc0,$car0,$car0
414         add     $tpj,$sbit,$sbit
415         ld      [$ap+$j],$apj                   ! ap[j]
416         and     $car0,$mask,$acc0
417         ld      [$np+$j],$npj                   ! np[j]
418         srlx    $car0,32,$car0
419         add     $acc1,$car1,$car1
420         ld      [$tp+8],$tpj                    ! tp[j]
421         add     $acc0,$acc0,$acc0
422         add     $j,4,$j                         ! j++
423         add     $sbit,$acc0,$acc0
424         srlx    $acc0,32,$sbit
425         and     $acc0,$mask,$acc0
426         cmp     $j,$num
427         add     $acc0,$car1,$car1
428         st      $car1,[$tp]                     ! tp[j-1]
429         srlx    $car1,32,$car1
430         bl      %icc,.Lsqr_2nd
431         add     $tp,4,$tp                       ! tp++
432 !.Lsqr_2nd
433
434         mulx    $apj,$mul0,$acc0
435         mulx    $npj,$mul1,$acc1
436         add     $acc0,$car0,$car0
437         add     $tpj,$sbit,$sbit
438         and     $car0,$mask,$acc0
439         srlx    $car0,32,$car0
440         add     $acc1,$car1,$car1
441         add     $acc0,$acc0,$acc0
442         add     $sbit,$acc0,$acc0
443         srlx    $acc0,32,$sbit
444         and     $acc0,$mask,$acc0
445         add     $acc0,$car1,$car1
446         st      $car1,[$tp]                     ! tp[j-1]
447         srlx    $car1,32,$car1
448
449         add     $car0,$car0,$car0
450         add     $sbit,$car0,$car0
451         add     $car0,$car1,$car1
452         add     $car2,$car1,$car1
453         st      $car1,[$tp+4]
454         srlx    $car1,32,$car2
455 \f
456         ld      [%sp+$bias+$frame],$tmp1        ! tp[0]
457         ld      [%sp+$bias+$frame+4],$tpj       ! tp[1]
458         ld      [$ap+8],$mul0                   ! ap[2]
459         ld      [$np],$car1                     ! np[0]
460         ld      [$np+4],$npj                    ! np[1]
461         mulx    $n0,$tmp1,$mul1
462         and     $mul1,$mask,$mul1
463         mov     8,$i                            ! i=2 (byte offset 8)
464
465         mulx    $mul0,$mul0,$car0               ! ap[2]*ap[2]
466         mulx    $car1,$mul1,$car1
467         and     $car0,$mask,$acc0
468         add     $tmp1,$car1,$car1
469         srlx    $car0,32,$car0
470         add     %sp,$bias+$frame,$tp
471         srlx    $car1,32,$car1
472         and     $car0,1,$sbit
473         srlx    $car0,1,$car0
474         mov     4,$j
475
476 .Lsqr_outer:
477 .Lsqr_inner1:
478         mulx    $npj,$mul1,$acc1                ! while j<i only the np[j] product is added
479         add     $tpj,$car1,$car1
480         add     $j,4,$j
481         ld      [$tp+8],$tpj
482         cmp     $j,$i
483         add     $acc1,$car1,$car1
484         ld      [$np+$j],$npj
485         st      $car1,[$tp]
486         srlx    $car1,32,$car1
487         bl      %icc,.Lsqr_inner1
488         add     $tp,4,$tp
489 !.Lsqr_inner1
490
491         add     $j,4,$j
492         ld      [$ap+$j],$apj                   ! ap[j]
493         mulx    $npj,$mul1,$acc1
494         add     $tpj,$car1,$car1
495         ld      [$np+$j],$npj                   ! np[j]
496         add     $acc0,$car1,$car1               ! low word of ap[i]*ap[i]
497         ld      [$tp+8],$tpj                    ! tp[j]
498         add     $acc1,$car1,$car1
499         st      $car1,[$tp]
500         srlx    $car1,32,$car1
501
502         add     $j,4,$j
503         cmp     $j,$num
504         be,pn   %icc,.Lsqr_no_inner2
505         add     $tp,4,$tp
506
507 .Lsqr_inner2:
508         mulx    $apj,$mul0,$acc0
509         mulx    $npj,$mul1,$acc1
510         add     $tpj,$sbit,$sbit
511         add     $acc0,$car0,$car0
512         ld      [$ap+$j],$apj                   ! ap[j]
513         and     $car0,$mask,$acc0
514         ld      [$np+$j],$npj                   ! np[j]
515         srlx    $car0,32,$car0
516         add     $acc0,$acc0,$acc0
517         ld      [$tp+8],$tpj                    ! tp[j]
518         add     $sbit,$acc0,$acc0
519         add     $j,4,$j                         ! j++
520         srlx    $acc0,32,$sbit
521         and     $acc0,$mask,$acc0
522         cmp     $j,$num
523         add     $acc0,$car1,$car1
524         add     $acc1,$car1,$car1
525         st      $car1,[$tp]                     ! tp[j-1]
526         srlx    $car1,32,$car1
527         bl      %icc,.Lsqr_inner2
528         add     $tp,4,$tp                       ! tp++
529
530 .Lsqr_no_inner2:
531         mulx    $apj,$mul0,$acc0
532         mulx    $npj,$mul1,$acc1
533         add     $tpj,$sbit,$sbit
534         add     $acc0,$car0,$car0
535         and     $car0,$mask,$acc0
536         srlx    $car0,32,$car0
537         add     $acc0,$acc0,$acc0
538         add     $sbit,$acc0,$acc0
539         srlx    $acc0,32,$sbit
540         and     $acc0,$mask,$acc0
541         add     $acc0,$car1,$car1
542         add     $acc1,$car1,$car1
543         st      $car1,[$tp]                     ! tp[j-1]
544         srlx    $car1,32,$car1
545
546         add     $car0,$car0,$car0
547         add     $sbit,$car0,$car0
548         add     $car0,$car1,$car1
549         add     $car2,$car1,$car1
550         st      $car1,[$tp+4]
551         srlx    $car1,32,$car2
552 \f
553         add     $i,4,$i                         ! i++
554         ld      [%sp+$bias+$frame],$tmp1        ! tp[0]
555         ld      [%sp+$bias+$frame+4],$tpj       ! tp[1]
556         ld      [$ap+$i],$mul0                  ! ap[i]
557         ld      [$np],$car1                     ! np[0]
558         ld      [$np+4],$npj                    ! np[1]
559         mulx    $n0,$tmp1,$mul1
560         and     $mul1,$mask,$mul1
561         add     $i,4,$tmp0
562
563         mulx    $mul0,$mul0,$car0               ! ap[i]*ap[i]
564         mulx    $car1,$mul1,$car1
565         and     $car0,$mask,$acc0
566         add     $tmp1,$car1,$car1
567         srlx    $car0,32,$car0
568         add     %sp,$bias+$frame,$tp
569         srlx    $car1,32,$car1
570         and     $car0,1,$sbit
571         srlx    $car0,1,$car0
572
573         cmp     $tmp0,$num                      ! i<num-1
574         bl      %icc,.Lsqr_outer
575         mov     4,$j
576 \f
577 .Lsqr_last:
578         mulx    $npj,$mul1,$acc1
579         add     $tpj,$car1,$car1
580         add     $j,4,$j
581         ld      [$tp+8],$tpj
582         cmp     $j,$i
583         add     $acc1,$car1,$car1
584         ld      [$np+$j],$npj
585         st      $car1,[$tp]
586         srlx    $car1,32,$car1
587         bl      %icc,.Lsqr_last
588         add     $tp,4,$tp
589 !.Lsqr_last
590
591         mulx    $npj,$mul1,$acc1
592         add     $tpj,$acc0,$acc0
593         srlx    $acc0,32,$tmp0
594         and     $acc0,$mask,$acc0
595         add     $tmp0,$sbit,$sbit
596         add     $acc0,$car1,$car1
597         add     $acc1,$car1,$car1
598         st      $car1,[$tp]
599         srlx    $car1,32,$car1
600
601         add     $car0,$car0,$car0               ! recover $car0
602         add     $sbit,$car0,$car0
603         add     $car0,$car1,$car1
604         add     $car2,$car1,$car1
605         st      $car1,[$tp+4]
606         srlx    $car1,32,$car2
607
608         ba      .Ltail                          ! shared subtract and conditional copy
609         add     $tp,8,$tp                       ! delay slot: step past the two words just stored
610 .type   $fname,#function
611 .size   $fname,(.-$fname)
612 .asciz  "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
613 .align  32
614 ___
615 $code =~ s/\`([^\`]*)\`/eval($1)/gem;   # replace each back-quoted span with its eval result
616 print $code;
617 close STDOUT or die "error closing STDOUT: $!";    # catches buffered write errors too