projects
/
openssl.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
bn/asm/sparcv9-mont.pl: fix squaring code path.
[openssl.git]
/
crypto
/
bn
/
asm
/
sparcv9-mont.pl
diff --git a/crypto/bn/asm/sparcv9-mont.pl b/crypto/bn/asm/sparcv9-mont.pl
index 2e12eeb578e8232fa477cb891404ad21508890b4..2697965b3f28050f64f1f5b713038a0d5896bd35 100644
(file)
--- a/crypto/bn/asm/sparcv9-mont.pl
+++ b/crypto/bn/asm/sparcv9-mont.pl
@@
-1,9
+1,17
@@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# December 2005
# ====================================================================
# December 2005
@@
-12,7
+20,7
@@
# for undertaken effort are multiple. First of all, UltraSPARC is not
# the whole SPARCv9 universe and other VIS-free implementations deserve
# optimized code as much. Secondly, newly introduced UltraSPARC T1,
# for undertaken effort are multiple. First of all, UltraSPARC is not
# the whole SPARCv9 universe and other VIS-free implementations deserve
# optimized code as much. Secondly, newly introduced UltraSPARC T1,
-# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive pathes,
+# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
# several integrated RSA/DSA accelerator circuits accessible through
# kernel driver [only(*)], but having decent user-land software
# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
# several integrated RSA/DSA accelerator circuits accessible through
# kernel driver [only(*)], but having decent user-land software
@@
-41,6
+49,9
@@
# module still have hidden potential [see TODO list there], which is
# estimated to be larger than 20%...
# module still have hidden potential [see TODO list there], which is
# estimated to be larger than 20%...
+$output = pop;
+open STDOUT,">$output";
+
# int bn_mul_mont(
$rp="%i0"; # BN_ULONG *rp,
$ap="%i1"; # const BN_ULONG *ap,
# int bn_mul_mont(
$rp="%i0"; # BN_ULONG *rp,
$ap="%i1"; # const BN_ULONG *ap,
@@
-49,10
+60,8
@@
$np="%i3"; # const BN_ULONG *np,
$n0="%i4"; # const BN_ULONG *n0,
$num="%i5"; # int num);
$n0="%i4"; # const BN_ULONG *n0,
$num="%i5"; # int num);
-$bits=32;
-for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-if ($bits==64) { $bias=2047; $frame=192; }
-else { $bias=0; $frame=128; }
+$frame="STACK_FRAME";
+$bias="STACK_BIAS";
$car0="%o0";
$car1="%o1";
$car0="%o0";
$car1="%o1";
@@
-75,6
+84,8
@@
$tpj="%l7";
$fname="bn_mul_mont_int";
$code=<<___;
$fname="bn_mul_mont_int";
$code=<<___;
+#include "sparc_arch.h"
+
.section ".text",#alloc,#execinstr
.global $fname
.section ".text",#alloc,#execinstr
.global $fname
@@
-94,17
+105,17
@@
$fname:
cmp $ap,$bp
and $num,$mask,$num
ld [$bp],$mul0 ! bp[0]
cmp $ap,$bp
and $num,$mask,$num
ld [$bp],$mul0 ! bp[0]
- be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
nop
add %sp,$bias,%o7 ! real top of stack
nop
add %sp,$bias,%o7 ! real top of stack
- ld [$ap],$car0 ! ap[0]
+ ld [$ap],$car0 ! ap[0] ! redundant in squaring context
sub %o7,$num,%o7
ld [$ap+4],$apj ! ap[1]
and %o7,-1024,%o7
ld [$np],$car1 ! np[0]
sub %o7,$bias,%sp ! alloca
ld [$np+4],$npj ! np[1]
sub %o7,$num,%o7
ld [$ap+4],$apj ! ap[1]
and %o7,-1024,%o7
ld [$np],$car1 ! np[0]
sub %o7,$bias,%sp ! alloca
ld [$np+4],$npj ! np[1]
+ be,pt SIZE_T_CC,.Lbn_sqr_mont
mov 12,$j
mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
mov 12,$j
mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
@@
-254,44
+265,32
@@
$fname:
.Ltail:
add $np,$num,$np
add $rp,$num,$rp
.Ltail:
add $np,$num,$np
add $rp,$num,$rp
-
- cmp $car2,0 ! clears %icc.c
- bne,pn %icc,.Lsub
+ mov $tp,$ap
sub %g0,$num,%o7 ! k=-num
sub %g0,$num,%o7 ! k=-num
-
- cmp $car1,$npj ! compare top-most $tp and $np words
- bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken
- nop
-
-.align 16,0x1000000
+ ba .Lsub
+ subcc %g0,%g0,%g0 ! clear %icc.c
+.align 16
.Lsub:
ld [$tp+%o7],%o0
ld [$np+%o7],%o1
.Lsub:
ld [$tp+%o7],%o0
ld [$np+%o7],%o1
- subccc %o0,%o1,%o1
- st %o1,[$rp+%o7]
+ subccc %o0,%o1,%o1 ! tp[j]-np[j]
+ add $rp,%o7,$i
add %o7,4,%o7
brnz %o7,.Lsub
add %o7,4,%o7
brnz %o7,.Lsub
- nop
- subccc $car2,0,$car2
- bcc %icc,.Lzap
+ st %o1,[$i]
+ subc $car2,0,$car2 ! handle upmost overflow bit
+ and $tp,$car2,$ap
+ andn $rp,$car2,$np
+ or $ap,$np,$ap
sub %g0,$num,%o7
sub %g0,$num,%o7
-.align 16,0x1000000
.Lcopy:
.Lcopy:
- ld [$tp+%o7],%o0
+ ld [$ap+%o7],%o0 ! copy or in-place refresh
+ st %g0,[$tp+%o7] ! zap tp
st %o0,[$rp+%o7]
add %o7,4,%o7
brnz %o7,.Lcopy
nop
st %o0,[$rp+%o7]
add %o7,4,%o7
brnz %o7,.Lcopy
nop
- ba .Lzap
- sub %g0,$num,%o7
-
-.align 32
-.Lzap:
- st %g0,[$tp+%o7]
- add %o7,4,%o7
- brnz %o7,.Lzap
- nop
mov 1,%i0
ret
restore
mov 1,%i0
ret
restore
@@
-301,20
+300,11
@@
___
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without following dedicated squaring procedure.
########
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without following dedicated squaring procedure.
########
-$sbit="%i2"; # re-use $bp!
+$sbit="%o5";
$code.=<<___;
.align 32
.Lbn_sqr_mont:
$code.=<<___;
.align 32
.Lbn_sqr_mont:
- add %sp,$bias,%o7 ! real top of stack
- ld [$ap+4],$apj ! ap[1]
- sub %o7,$num,%o7
- ld [$np],$car1 ! np[0]
- and %o7,-1024,%o7
- ld [$np+4],$npj ! np[1]
- sub %o7,$bias,%sp ! alloca
- mov 12,$j
-
mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
mulx $apj,$mul0,$tmp0 !prologue!
and $car0,$mask,$acc0
mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
mulx $apj,$mul0,$tmp0 !prologue!
and $car0,$mask,$acc0
@@
-423,7
+413,7
@@
$code.=<<___;
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
- add $tpj,$car1,$car1
+ add $tpj,$sbit,$sbit
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
@@
-432,7
+422,7
@@
$code.=<<___;
ld [$tp+8],$tpj ! tp[j]
add $acc0,$acc0,$acc0
add $j,4,$j ! j++
ld [$tp+8],$tpj ! tp[j]
add $acc0,$acc0,$acc0
add $j,4,$j ! j++
- or $sbit,$acc0,$acc0
+ add $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
cmp $j,$num
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
cmp $j,$num
@@
-446,12
+436,12
@@
$code.=<<___;
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
- add $tpj,$car1,$car1
+ add $tpj,$sbit,$sbit
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc1,$car1,$car1
add $acc0,$acc0,$acc0
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc1,$car1,$car1
add $acc0,$acc0,$acc0
- or $sbit,$acc0,$acc0
+ add $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
@@
-459,7
+449,7
@@
$code.=<<___;
srlx $car1,32,$car1
add $car0,$car0,$car0
srlx $car1,32,$car1
add $car0,$car0,$car0
- or $sbit,$car0,$car0
+ add $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
@@
-519,7
+509,7
@@
$code.=<<___;
.Lsqr_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
.Lsqr_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
- add $tpj,$car1,$car1
+ add $tpj,$sbit,$sbit
add $acc0,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
add $acc0,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
@@
-527,7
+517,7
@@
$code.=<<___;
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
ld [$tp+8],$tpj ! tp[j]
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
ld [$tp+8],$tpj ! tp[j]
- or $sbit,$acc0,$acc0
+ add $sbit,$acc0,$acc0
add $j,4,$j ! j++
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $j,4,$j ! j++
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
@@
-542,12
+532,12
@@
$code.=<<___;
.Lsqr_no_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
.Lsqr_no_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
- add $tpj,$car1,$car1
+ add $tpj,$sbit,$sbit
add $acc0,$car0,$car0
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
add $acc0,$car0,$car0
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
- or $sbit,$acc0,$acc0
+ add $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
@@
-556,7
+546,7
@@
$code.=<<___;
srlx $car1,32,$car1
add $car0,$car0,$car0
srlx $car1,32,$car1
add $car0,$car0,$car0
- or $sbit,$car0,$car0
+ add $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
@@
-601,14
+591,17
@@
$code.=<<___;
!.Lsqr_last
mulx $npj,$mul1,$acc1
!.Lsqr_last
mulx $npj,$mul1,$acc1
- add $tpj,$car1,$car1
+ add $tpj,$acc0,$acc0
+ srlx $acc0,32,$tmp0
+ and $acc0,$mask,$acc0
+ add $tmp0,$sbit,$sbit
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $car0,$car0,$car0 ! recover $car0
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $car0,$car0,$car0 ! recover $car0
- or $sbit,$car0,$car0
+ add $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
@@
-618,6
+611,8
@@
$code.=<<___;
add $tp,8,$tp
.type $fname,#function
.size $fname,(.-$fname)
add $tp,8,$tp
.type $fname,#function
.size $fname,(.-$fname)
+.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align 32
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;