-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2012-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# October 2012.
#
-# SPARCv9 VIS3 Montgomery multiplicaion procedure suitable for T3 and
+# SPARCv9 VIS3 Montgomery multiplication procedure suitable for T3 and
# onward. There are three new instructions used here: umulxhi,
# addxc[cc] and initializing store. On T3 RSA private key operations
# are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
# for reference purposes, because T4 has dedicated Montgomery
# multiplication and squaring *instructions* that deliver even more.
-$bits=32;
-for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-if ($bits==64) { $bias=2047; $frame=192; }
-else { $bias=0; $frame=112; }
+$output = pop;
+open STDOUT,">$output";
+
+$frame = "STACK_FRAME";
+$bias = "STACK_BIAS";
-$code.=<<___ if ($bits==64);
+$code.=<<___;
+#include "sparc_arch.h"
+
+#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
-___
-$code.=<<___;
+#endif
+
.section ".text",#alloc,#execinstr
___
ld [$ap+12], $t3
or $t0, $aj, $aj
add $ap, 16, $ap
- stxa $aj, [$anp]0xe2 ! converted ap[0]
+ stx $aj, [$anp] ! converted ap[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
umulxhi $aj, $m0, $hi0
sllx $t1, 32, $aj
add $ap, 8, $ap
or $t0, $aj, $aj
- stxa $aj, [$anp]0xe2 ! converted ap[j]
+ stx $aj, [$anp] ! converted ap[j]
ld [$np+0], $t2 ! np[j]
addcc $nlo, $hi1, $lo1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
umulxhi $nj, $m1, $nj ! nhi=nj
addxc %g0, $hi1, $hi1
- stxa $lo1, [$tp]0xe2 ! tp[j-1]
+ stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp ! tp++
brnz,pt $cnt, .L1st
addxc $nj, %g0, $hi1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
addxc %g0, $hi1, $hi1
- stxa $lo1, [$tp]0xe2 ! tp[j-1]
+ stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp
addcc $hi0, $hi1, $hi1
addxc %g0, %g0, $ovf ! upmost overflow bit
- stxa $hi1, [$tp]0xe2
+ stx $hi1, [$tp]
add $tp, 8, $tp
\f
ba .Louter
sub $anp, $num, $anp
sub $rp, $num, $rp
- subc $ovf, %g0, $ovf ! handle upmost overflow bit
- and $tp, $ovf, $ap
- andn $rp, $ovf, $np
- or $np, $ap, $ap ! ap=borrow?tp:rp
+ subccc $ovf, %g0, $ovf ! handle upmost overflow bit
ba .Lcopy
sub $num, 8, $cnt
.align 16
-.Lcopy: ! copy or in-place refresh
- ld [$ap+0], $t2
- ld [$ap+4], $t3
- add $ap, 8, $ap
+.Lcopy: ! conditional copy
+ ld [$tp+0], $t0
+ ld [$tp+4], $t1
+ ld [$rp+0], $t2
+ ld [$rp+4], $t3
stx %g0, [$tp] ! zap
add $tp, 8, $tp
stx %g0, [$anp] ! zap
stx %g0, [$anp+8]
add $anp, 16, $anp
+ movcs %icc, $t0, $t2
+ movcs %icc, $t1, $t3
st $t3, [$rp+0] ! flip order
st $t2, [$rp+4]
add $rp, 8, $rp
\f
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
-# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
sub unvis3 {