Raise an error on syscall failure in tls_retry_write_records
[openssl.git] / crypto / bn / asm / sparcv9-mont.pl
1 #! /usr/bin/env perl
2 # Copyright 2005-2021 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # December 2005
18 #
19 # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
20 # for undertaken effort are multiple. First of all, UltraSPARC is not
21 # the whole SPARCv9 universe and other VIS-free implementations deserve
22 # optimized code as much. Secondly, newly introduced UltraSPARC T1,
23 # a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
24 # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
25 # several integrated RSA/DSA accelerator circuits accessible through
26 # kernel driver [only(*)], but having decent user-land software
27 # implementation is important too. Finally, reasons like desire to
28 # experiment with dedicated squaring procedure. Yes, this module
29 # implements one, because it was easiest to draft it in SPARCv9
30 # instructions...
31
32 # (*)   Engine accessing the driver in question is on my TODO list.
33 #       For reference, accelerator is estimated to give 6 to 10 times
34 #       improvement on single-threaded RSA sign. It should be noted
35 #       that 6-10x improvement coefficient does not actually mean
36 #       something extraordinary in terms of absolute [single-threaded]
37 #       performance, as SPARCv9 instruction set is by all means least
38 #       suitable for high performance crypto among other 64 bit
39 #       platforms. 6-10x factor simply places T1 in same performance
40 #       domain as say AMD64 and IA-64. Improvement of RSA verify don't
41 #       appear impressive at all, but it's the sign operation which is
42 #       far more critical/interesting.
43
44 # You might notice that inner loops are modulo-scheduled:-) This has
45 # essentially negligible impact on UltraSPARC performance, it's
46 # Fujitsu SPARC64 V users who should notice and hopefully appreciate
47 # the advantage... Currently this module surpasses sparcv9a-mont.pl
48 # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
49 # module still have hidden potential [see TODO list there], which is
50 # estimated to be larger than 20%...
51
52 $output = pop and open STDOUT,">$output";
53
54 # int bn_mul_mont(
55 $rp="%i0";      # BN_ULONG *rp,
56 $ap="%i1";      # const BN_ULONG *ap,
57 $bp="%i2";      # const BN_ULONG *bp,
58 $np="%i3";      # const BN_ULONG *np,
59 $n0="%i4";      # const BN_ULONG *n0,
60 $num="%i5";     # int num);
61
62 $frame="STACK_FRAME";
63 $bias="STACK_BIAS";
64
65 $car0="%o0";
66 $car1="%o1";
67 $car2="%o2";    # 1 bit
68 $acc0="%o3";
69 $acc1="%o4";
70 $mask="%g1";    # 32 bits, what a waste...
71 $tmp0="%g4";
72 $tmp1="%g5";
73
74 $i="%l0";
75 $j="%l1";
76 $mul0="%l2";
77 $mul1="%l3";
78 $tp="%l4";
79 $apj="%l5";
80 $npj="%l6";
81 $tpj="%l7";
82
83 $fname="bn_mul_mont_int";
84
85 $code=<<___;
86 #ifndef __ASSEMBLER__
87 # define __ASSEMBLER__ 1
88 #endif
89 #include "crypto/sparc_arch.h"
90
91 .section        ".text",#alloc,#execinstr
92
93 .global $fname
94 .align  32
95 $fname:
96         cmp     %o5,4                   ! 128 bits minimum
97         bge,pt  %icc,.Lenter
98         sethi   %hi(0xffffffff),$mask
99         retl
100         clr     %o0
101 .align  32
102 .Lenter:
103         save    %sp,-$frame,%sp
104         sll     $num,2,$num             ! num*=4
105         or      $mask,%lo(0xffffffff),$mask
106         ld      [$n0],$n0
107         cmp     $ap,$bp
108         and     $num,$mask,$num
109         ld      [$bp],$mul0             ! bp[0]
110         nop
111
112         add     %sp,$bias,%o7           ! real top of stack
113         ld      [$ap],$car0             ! ap[0] ! redundant in squaring context
114         sub     %o7,$num,%o7
115         ld      [$ap+4],$apj            ! ap[1]
116         and     %o7,-1024,%o7
117         ld      [$np],$car1             ! np[0]
118         sub     %o7,$bias,%sp           ! alloca
119         ld      [$np+4],$npj            ! np[1]
120         be,pt   SIZE_T_CC,.Lbn_sqr_mont
121         mov     12,$j
122
123         mulx    $car0,$mul0,$car0       ! ap[0]*bp[0]
124         mulx    $apj,$mul0,$tmp0        !prologue! ap[1]*bp[0]
125         and     $car0,$mask,$acc0
126         add     %sp,$bias+$frame,$tp
127         ld      [$ap+8],$apj            !prologue!
128
129         mulx    $n0,$acc0,$mul1         ! "t[0]"*n0
130         and     $mul1,$mask,$mul1
131
132         mulx    $car1,$mul1,$car1       ! np[0]*"t[0]"*n0
133         mulx    $npj,$mul1,$acc1        !prologue! np[1]*"t[0]"*n0
134         srlx    $car0,32,$car0
135         add     $acc0,$car1,$car1
136         ld      [$np+8],$npj            !prologue!
137         srlx    $car1,32,$car1
138         mov     $tmp0,$acc0             !prologue!
139
140 .L1st:
141         mulx    $apj,$mul0,$tmp0
142         mulx    $npj,$mul1,$tmp1
143         add     $acc0,$car0,$car0
144         ld      [$ap+$j],$apj           ! ap[j]
145         and     $car0,$mask,$acc0
146         add     $acc1,$car1,$car1
147         ld      [$np+$j],$npj           ! np[j]
148         srlx    $car0,32,$car0
149         add     $acc0,$car1,$car1
150         add     $j,4,$j                 ! j++
151         mov     $tmp0,$acc0
152         st      $car1,[$tp]
153         cmp     $j,$num
154         mov     $tmp1,$acc1
155         srlx    $car1,32,$car1
156         bl      %icc,.L1st
157         add     $tp,4,$tp               ! tp++
158 !.L1st
159
160         mulx    $apj,$mul0,$tmp0        !epilogue!
161         mulx    $npj,$mul1,$tmp1
162         add     $acc0,$car0,$car0
163         and     $car0,$mask,$acc0
164         add     $acc1,$car1,$car1
165         srlx    $car0,32,$car0
166         add     $acc0,$car1,$car1
167         st      $car1,[$tp]
168         srlx    $car1,32,$car1
169
170         add     $tmp0,$car0,$car0
171         and     $car0,$mask,$acc0
172         add     $tmp1,$car1,$car1
173         srlx    $car0,32,$car0
174         add     $acc0,$car1,$car1
175         st      $car1,[$tp+4]
176         srlx    $car1,32,$car1
177
178         add     $car0,$car1,$car1
179         st      $car1,[$tp+8]
180         srlx    $car1,32,$car2
181 \f
182         mov     4,$i                    ! i++
183         ld      [$bp+4],$mul0           ! bp[1]
184 .Louter:
185         add     %sp,$bias+$frame,$tp
186         ld      [$ap],$car0             ! ap[0]
187         ld      [$ap+4],$apj            ! ap[1]
188         ld      [$np],$car1             ! np[0]
189         ld      [$np+4],$npj            ! np[1]
190         ld      [$tp],$tmp1             ! tp[0]
191         ld      [$tp+4],$tpj            ! tp[1]
192         mov     12,$j
193
194         mulx    $car0,$mul0,$car0
195         mulx    $apj,$mul0,$tmp0        !prologue!
196         add     $tmp1,$car0,$car0
197         ld      [$ap+8],$apj            !prologue!
198         and     $car0,$mask,$acc0
199
200         mulx    $n0,$acc0,$mul1
201         and     $mul1,$mask,$mul1
202
203         mulx    $car1,$mul1,$car1
204         mulx    $npj,$mul1,$acc1        !prologue!
205         srlx    $car0,32,$car0
206         add     $acc0,$car1,$car1
207         ld      [$np+8],$npj            !prologue!
208         srlx    $car1,32,$car1
209         mov     $tmp0,$acc0             !prologue!
210
211 .Linner:
212         mulx    $apj,$mul0,$tmp0
213         mulx    $npj,$mul1,$tmp1
214         add     $tpj,$car0,$car0
215         ld      [$ap+$j],$apj           ! ap[j]
216         add     $acc0,$car0,$car0
217         add     $acc1,$car1,$car1
218         ld      [$np+$j],$npj           ! np[j]
219         and     $car0,$mask,$acc0
220         ld      [$tp+8],$tpj            ! tp[j]
221         srlx    $car0,32,$car0
222         add     $acc0,$car1,$car1
223         add     $j,4,$j                 ! j++
224         mov     $tmp0,$acc0
225         st      $car1,[$tp]             ! tp[j-1]
226         srlx    $car1,32,$car1
227         mov     $tmp1,$acc1
228         cmp     $j,$num
229         bl      %icc,.Linner
230         add     $tp,4,$tp               ! tp++
231 !.Linner
232
233         mulx    $apj,$mul0,$tmp0        !epilogue!
234         mulx    $npj,$mul1,$tmp1
235         add     $tpj,$car0,$car0
236         add     $acc0,$car0,$car0
237         ld      [$tp+8],$tpj            ! tp[j]
238         and     $car0,$mask,$acc0
239         add     $acc1,$car1,$car1
240         srlx    $car0,32,$car0
241         add     $acc0,$car1,$car1
242         st      $car1,[$tp]             ! tp[j-1]
243         srlx    $car1,32,$car1
244
245         add     $tpj,$car0,$car0
246         add     $tmp0,$car0,$car0
247         and     $car0,$mask,$acc0
248         add     $tmp1,$car1,$car1
249         add     $acc0,$car1,$car1
250         st      $car1,[$tp+4]           ! tp[j-1]
251         srlx    $car0,32,$car0
252         add     $i,4,$i                 ! i++
253         srlx    $car1,32,$car1
254
255         add     $car0,$car1,$car1
256         cmp     $i,$num
257         add     $car2,$car1,$car1
258         st      $car1,[$tp+8]
259
260         srlx    $car1,32,$car2
261         bl,a    %icc,.Louter
262         ld      [$bp+$i],$mul0          ! bp[i]
263 !.Louter
264
265         add     $tp,12,$tp
266 \f
267 .Ltail:
268         add     $np,$num,$np
269         add     $rp,$num,$rp
270         sub     %g0,$num,%o7            ! k=-num
271         ba      .Lsub
272         subcc   %g0,%g0,%g0             ! clear %icc.c
273 .align  16
274 .Lsub:
275         ld      [$tp+%o7],%o0
276         ld      [$np+%o7],%o1
277         subccc  %o0,%o1,%o1             ! tp[j]-np[j]
278         add     $rp,%o7,$i
279         add     %o7,4,%o7
280         brnz    %o7,.Lsub
281         st      %o1,[$i]
282         subccc  $car2,0,$car2           ! handle upmost overflow bit
283         sub     %g0,$num,%o7
284
285 .Lcopy:
286         ld      [$tp+%o7],%o1           ! conditional copy
287         ld      [$rp+%o7],%o0
288         st      %g0,[$tp+%o7]           ! zap tp
289         movcs   %icc,%o1,%o0
290         st      %o0,[$rp+%o7]
291         add     %o7,4,%o7
292         brnz    %o7,.Lcopy
293         nop
294         mov     1,%i0
295         ret
296         restore
297 ___
298 \f
299 ########
300 ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
301 ######## code without following dedicated squaring procedure.
302 ########
303 $sbit="%o5";
304
305 $code.=<<___;
306 .align  32
307 .Lbn_sqr_mont:
308         mulx    $mul0,$mul0,$car0               ! ap[0]*ap[0]
309         mulx    $apj,$mul0,$tmp0                !prologue!
310         and     $car0,$mask,$acc0
311         add     %sp,$bias+$frame,$tp
312         ld      [$ap+8],$apj                    !prologue!
313
314         mulx    $n0,$acc0,$mul1                 ! "t[0]"*n0
315         srlx    $car0,32,$car0
316         and     $mul1,$mask,$mul1
317
318         mulx    $car1,$mul1,$car1               ! np[0]*"t[0]"*n0
319         mulx    $npj,$mul1,$acc1                !prologue!
320         and     $car0,1,$sbit
321         ld      [$np+8],$npj                    !prologue!
322         srlx    $car0,1,$car0
323         add     $acc0,$car1,$car1
324         srlx    $car1,32,$car1
325         mov     $tmp0,$acc0                     !prologue!
326
327 .Lsqr_1st:
328         mulx    $apj,$mul0,$tmp0
329         mulx    $npj,$mul1,$tmp1
330         add     $acc0,$car0,$car0               ! ap[j]*a0+c0
331         add     $acc1,$car1,$car1
332         ld      [$ap+$j],$apj                   ! ap[j]
333         and     $car0,$mask,$acc0
334         ld      [$np+$j],$npj                   ! np[j]
335         srlx    $car0,32,$car0
336         add     $acc0,$acc0,$acc0
337         or      $sbit,$acc0,$acc0
338         mov     $tmp1,$acc1
339         srlx    $acc0,32,$sbit
340         add     $j,4,$j                         ! j++
341         and     $acc0,$mask,$acc0
342         cmp     $j,$num
343         add     $acc0,$car1,$car1
344         st      $car1,[$tp]
345         mov     $tmp0,$acc0
346         srlx    $car1,32,$car1
347         bl      %icc,.Lsqr_1st
348         add     $tp,4,$tp                       ! tp++
349 !.Lsqr_1st
350
351         mulx    $apj,$mul0,$tmp0                ! epilogue
352         mulx    $npj,$mul1,$tmp1
353         add     $acc0,$car0,$car0               ! ap[j]*a0+c0
354         add     $acc1,$car1,$car1
355         and     $car0,$mask,$acc0
356         srlx    $car0,32,$car0
357         add     $acc0,$acc0,$acc0
358         or      $sbit,$acc0,$acc0
359         srlx    $acc0,32,$sbit
360         and     $acc0,$mask,$acc0
361         add     $acc0,$car1,$car1
362         st      $car1,[$tp]
363         srlx    $car1,32,$car1
364
365         add     $tmp0,$car0,$car0               ! ap[j]*a0+c0
366         add     $tmp1,$car1,$car1
367         and     $car0,$mask,$acc0
368         srlx    $car0,32,$car0
369         add     $acc0,$acc0,$acc0
370         or      $sbit,$acc0,$acc0
371         srlx    $acc0,32,$sbit
372         and     $acc0,$mask,$acc0
373         add     $acc0,$car1,$car1
374         st      $car1,[$tp+4]
375         srlx    $car1,32,$car1
376
377         add     $car0,$car0,$car0
378         or      $sbit,$car0,$car0
379         add     $car0,$car1,$car1
380         st      $car1,[$tp+8]
381         srlx    $car1,32,$car2
382 \f
383         ld      [%sp+$bias+$frame],$tmp0        ! tp[0]
384         ld      [%sp+$bias+$frame+4],$tmp1      ! tp[1]
385         ld      [%sp+$bias+$frame+8],$tpj       ! tp[2]
386         ld      [$ap+4],$mul0                   ! ap[1]
387         ld      [$ap+8],$apj                    ! ap[2]
388         ld      [$np],$car1                     ! np[0]
389         ld      [$np+4],$npj                    ! np[1]
390         mulx    $n0,$tmp0,$mul1
391
392         mulx    $mul0,$mul0,$car0
393         and     $mul1,$mask,$mul1
394
395         mulx    $car1,$mul1,$car1
396         mulx    $npj,$mul1,$acc1
397         add     $tmp0,$car1,$car1
398         and     $car0,$mask,$acc0
399         ld      [$np+8],$npj                    ! np[2]
400         srlx    $car1,32,$car1
401         add     $tmp1,$car1,$car1
402         srlx    $car0,32,$car0
403         add     $acc0,$car1,$car1
404         and     $car0,1,$sbit
405         add     $acc1,$car1,$car1
406         srlx    $car0,1,$car0
407         mov     12,$j
408         st      $car1,[%sp+$bias+$frame]        ! tp[0]=
409         srlx    $car1,32,$car1
410         add     %sp,$bias+$frame+4,$tp
411
412 .Lsqr_2nd:
413         mulx    $apj,$mul0,$acc0
414         mulx    $npj,$mul1,$acc1
415         add     $acc0,$car0,$car0
416         add     $tpj,$sbit,$sbit
417         ld      [$ap+$j],$apj                   ! ap[j]
418         and     $car0,$mask,$acc0
419         ld      [$np+$j],$npj                   ! np[j]
420         srlx    $car0,32,$car0
421         add     $acc1,$car1,$car1
422         ld      [$tp+8],$tpj                    ! tp[j]
423         add     $acc0,$acc0,$acc0
424         add     $j,4,$j                         ! j++
425         add     $sbit,$acc0,$acc0
426         srlx    $acc0,32,$sbit
427         and     $acc0,$mask,$acc0
428         cmp     $j,$num
429         add     $acc0,$car1,$car1
430         st      $car1,[$tp]                     ! tp[j-1]
431         srlx    $car1,32,$car1
432         bl      %icc,.Lsqr_2nd
433         add     $tp,4,$tp                       ! tp++
434 !.Lsqr_2nd
435
436         mulx    $apj,$mul0,$acc0
437         mulx    $npj,$mul1,$acc1
438         add     $acc0,$car0,$car0
439         add     $tpj,$sbit,$sbit
440         and     $car0,$mask,$acc0
441         srlx    $car0,32,$car0
442         add     $acc1,$car1,$car1
443         add     $acc0,$acc0,$acc0
444         add     $sbit,$acc0,$acc0
445         srlx    $acc0,32,$sbit
446         and     $acc0,$mask,$acc0
447         add     $acc0,$car1,$car1
448         st      $car1,[$tp]                     ! tp[j-1]
449         srlx    $car1,32,$car1
450
451         add     $car0,$car0,$car0
452         add     $sbit,$car0,$car0
453         add     $car0,$car1,$car1
454         add     $car2,$car1,$car1
455         st      $car1,[$tp+4]
456         srlx    $car1,32,$car2
457 \f
458         ld      [%sp+$bias+$frame],$tmp1        ! tp[0]
459         ld      [%sp+$bias+$frame+4],$tpj       ! tp[1]
460         ld      [$ap+8],$mul0                   ! ap[2]
461         ld      [$np],$car1                     ! np[0]
462         ld      [$np+4],$npj                    ! np[1]
463         mulx    $n0,$tmp1,$mul1
464         and     $mul1,$mask,$mul1
465         mov     8,$i
466
467         mulx    $mul0,$mul0,$car0
468         mulx    $car1,$mul1,$car1
469         and     $car0,$mask,$acc0
470         add     $tmp1,$car1,$car1
471         srlx    $car0,32,$car0
472         add     %sp,$bias+$frame,$tp
473         srlx    $car1,32,$car1
474         and     $car0,1,$sbit
475         srlx    $car0,1,$car0
476         mov     4,$j
477
478 .Lsqr_outer:
479 .Lsqr_inner1:
480         mulx    $npj,$mul1,$acc1
481         add     $tpj,$car1,$car1
482         add     $j,4,$j
483         ld      [$tp+8],$tpj
484         cmp     $j,$i
485         add     $acc1,$car1,$car1
486         ld      [$np+$j],$npj
487         st      $car1,[$tp]
488         srlx    $car1,32,$car1
489         bl      %icc,.Lsqr_inner1
490         add     $tp,4,$tp
491 !.Lsqr_inner1
492
493         add     $j,4,$j
494         ld      [$ap+$j],$apj                   ! ap[j]
495         mulx    $npj,$mul1,$acc1
496         add     $tpj,$car1,$car1
497         ld      [$np+$j],$npj                   ! np[j]
498         srlx    $car1,32,$tmp0
499         and     $car1,$mask,$car1
500         add     $tmp0,$sbit,$sbit
501         add     $acc0,$car1,$car1
502         ld      [$tp+8],$tpj                    ! tp[j]
503         add     $acc1,$car1,$car1
504         st      $car1,[$tp]
505         srlx    $car1,32,$car1
506
507         add     $j,4,$j
508         cmp     $j,$num
509         be,pn   %icc,.Lsqr_no_inner2
510         add     $tp,4,$tp
511
512 .Lsqr_inner2:
513         mulx    $apj,$mul0,$acc0
514         mulx    $npj,$mul1,$acc1
515         add     $tpj,$sbit,$sbit
516         add     $acc0,$car0,$car0
517         ld      [$ap+$j],$apj                   ! ap[j]
518         and     $car0,$mask,$acc0
519         ld      [$np+$j],$npj                   ! np[j]
520         srlx    $car0,32,$car0
521         add     $acc0,$acc0,$acc0
522         ld      [$tp+8],$tpj                    ! tp[j]
523         add     $sbit,$acc0,$acc0
524         add     $j,4,$j                         ! j++
525         srlx    $acc0,32,$sbit
526         and     $acc0,$mask,$acc0
527         cmp     $j,$num
528         add     $acc0,$car1,$car1
529         add     $acc1,$car1,$car1
530         st      $car1,[$tp]                     ! tp[j-1]
531         srlx    $car1,32,$car1
532         bl      %icc,.Lsqr_inner2
533         add     $tp,4,$tp                       ! tp++
534
535 .Lsqr_no_inner2:
536         mulx    $apj,$mul0,$acc0
537         mulx    $npj,$mul1,$acc1
538         add     $tpj,$sbit,$sbit
539         add     $acc0,$car0,$car0
540         and     $car0,$mask,$acc0
541         srlx    $car0,32,$car0
542         add     $acc0,$acc0,$acc0
543         add     $sbit,$acc0,$acc0
544         srlx    $acc0,32,$sbit
545         and     $acc0,$mask,$acc0
546         add     $acc0,$car1,$car1
547         add     $acc1,$car1,$car1
548         st      $car1,[$tp]                     ! tp[j-1]
549         srlx    $car1,32,$car1
550
551         add     $car0,$car0,$car0
552         add     $sbit,$car0,$car0
553         add     $car0,$car1,$car1
554         add     $car2,$car1,$car1
555         st      $car1,[$tp+4]
556         srlx    $car1,32,$car2
557 \f
558         add     $i,4,$i                         ! i++
559         ld      [%sp+$bias+$frame],$tmp1        ! tp[0]
560         ld      [%sp+$bias+$frame+4],$tpj       ! tp[1]
561         ld      [$ap+$i],$mul0                  ! ap[j]
562         ld      [$np],$car1                     ! np[0]
563         ld      [$np+4],$npj                    ! np[1]
564         mulx    $n0,$tmp1,$mul1
565         and     $mul1,$mask,$mul1
566         add     $i,4,$tmp0
567
568         mulx    $mul0,$mul0,$car0
569         mulx    $car1,$mul1,$car1
570         and     $car0,$mask,$acc0
571         add     $tmp1,$car1,$car1
572         srlx    $car0,32,$car0
573         add     %sp,$bias+$frame,$tp
574         srlx    $car1,32,$car1
575         and     $car0,1,$sbit
576         srlx    $car0,1,$car0
577
578         cmp     $tmp0,$num                      ! i<num-1
579         bl      %icc,.Lsqr_outer
580         mov     4,$j
581 \f
582 .Lsqr_last:
583         mulx    $npj,$mul1,$acc1
584         add     $tpj,$car1,$car1
585         add     $j,4,$j
586         ld      [$tp+8],$tpj
587         cmp     $j,$i
588         add     $acc1,$car1,$car1
589         ld      [$np+$j],$npj
590         st      $car1,[$tp]
591         srlx    $car1,32,$car1
592         bl      %icc,.Lsqr_last
593         add     $tp,4,$tp
594 !.Lsqr_last
595
596         mulx    $npj,$mul1,$acc1
597         add     $tpj,$acc0,$acc0
598         srlx    $acc0,32,$tmp0
599         and     $acc0,$mask,$acc0
600         add     $tmp0,$sbit,$sbit
601         add     $acc0,$car1,$car1
602         add     $acc1,$car1,$car1
603         st      $car1,[$tp]
604         srlx    $car1,32,$car1
605
606         add     $car0,$car0,$car0               ! recover $car0
607         add     $sbit,$car0,$car0
608         add     $car0,$car1,$car1
609         add     $car2,$car1,$car1
610         st      $car1,[$tp+4]
611         srlx    $car1,32,$car2
612
613         ba      .Ltail
614         add     $tp,8,$tp
615 .type   $fname,#function
616 .size   $fname,(.-$fname)
617 .asciz  "Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
618 .align  32
619 ___
620 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
621 print $code;
622 close STDOUT or die "error closing STDOUT: $!";