X-Git-Url: https://git.openssl.org/?p=openssl.git;a=blobdiff_plain;f=crypto%2Fbn%2Fasm%2Fsparcv9-mont.pl;h=2697965b3f28050f64f1f5b713038a0d5896bd35;hp=4f922c3110e1230c472b5e6b85d3dfc7a9059ffc;hb=120a9e1a825bd0407639bedb1e8e15823cf7a545;hpb=eb77e8886df84526f42f566632be71d4ed373308 diff --git a/crypto/bn/asm/sparcv9-mont.pl b/crypto/bn/asm/sparcv9-mont.pl index 4f922c3110..2697965b3f 100644 --- a/crypto/bn/asm/sparcv9-mont.pl +++ b/crypto/bn/asm/sparcv9-mont.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov for the OpenSSL @@ -13,7 +20,7 @@ # for undertaken effort are multiple. First of all, UltraSPARC is not # the whole SPARCv9 universe and other VIS-free implementations deserve # optimized code as much. Secondly, newly introduced UltraSPARC T1, -# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive pathes, +# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths, # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with # several integrated RSA/DSA accelerator circuits accessible through # kernel driver [only(*)], but having decent user-land software @@ -293,7 +300,7 @@ ___ ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over ######## code without following dedicated squaring procedure. ######## -$sbit="%i2"; # re-use $bp! +$sbit="%o5"; $code.=<<___; .align 32 @@ -406,7 +413,7 @@ $code.=<<___; mulx $apj,$mul0,$acc0 mulx $npj,$mul1,$acc1 add $acc0,$car0,$car0 - add $tpj,$car1,$car1 + add $tpj,$sbit,$sbit ld [$ap+$j],$apj ! ap[j] and $car0,$mask,$acc0 ld [$np+$j],$npj ! np[j] @@ -415,7 +422,7 @@ $code.=<<___; ld [$tp+8],$tpj ! tp[j] add $acc0,$acc0,$acc0 add $j,4,$j ! j++ - or $sbit,$acc0,$acc0 + add $sbit,$acc0,$acc0 srlx $acc0,32,$sbit and $acc0,$mask,$acc0 cmp $j,$num @@ -429,12 +436,12 @@ $code.=<<___; mulx $apj,$mul0,$acc0 mulx $npj,$mul1,$acc1 add $acc0,$car0,$car0 - add $tpj,$car1,$car1 + add $tpj,$sbit,$sbit and $car0,$mask,$acc0 srlx $car0,32,$car0 add $acc1,$car1,$car1 add $acc0,$acc0,$acc0 - or $sbit,$acc0,$acc0 + add $sbit,$acc0,$acc0 srlx $acc0,32,$sbit and $acc0,$mask,$acc0 add $acc0,$car1,$car1 @@ -442,7 +449,7 @@ $code.=<<___; srlx $car1,32,$car1 add $car0,$car0,$car0 - or $sbit,$car0,$car0 + add $sbit,$car0,$car0 add $car0,$car1,$car1 add $car2,$car1,$car1 st $car1,[$tp+4] @@ -502,7 +509,7 @@ $code.=<<___; .Lsqr_inner2: mulx $apj,$mul0,$acc0 mulx $npj,$mul1,$acc1 - add $tpj,$car1,$car1 + add $tpj,$sbit,$sbit add $acc0,$car0,$car0 ld [$ap+$j],$apj ! ap[j] and $car0,$mask,$acc0 @@ -510,7 +517,7 @@ $code.=<<___; srlx $car0,32,$car0 add $acc0,$acc0,$acc0 ld [$tp+8],$tpj ! tp[j] - or $sbit,$acc0,$acc0 + add $sbit,$acc0,$acc0 add $j,4,$j ! j++ srlx $acc0,32,$sbit and $acc0,$mask,$acc0 @@ -525,12 +532,12 @@ $code.=<<___; .Lsqr_no_inner2: mulx $apj,$mul0,$acc0 mulx $npj,$mul1,$acc1 - add $tpj,$car1,$car1 + add $tpj,$sbit,$sbit add $acc0,$car0,$car0 and $car0,$mask,$acc0 srlx $car0,32,$car0 add $acc0,$acc0,$acc0 - or $sbit,$acc0,$acc0 + add $sbit,$acc0,$acc0 srlx $acc0,32,$sbit and $acc0,$mask,$acc0 add $acc0,$car1,$car1 @@ -539,7 +546,7 @@ $code.=<<___; srlx $car1,32,$car1 add $car0,$car0,$car0 - or $sbit,$car0,$car0 + add $sbit,$car0,$car0 add $car0,$car1,$car1 add $car2,$car1,$car1 st $car1,[$tp+4] @@ -584,14 +591,17 @@ $code.=<<___; !.Lsqr_last mulx $npj,$mul1,$acc1 - add $tpj,$car1,$car1 + add $tpj,$acc0,$acc0 + srlx $acc0,32,$tmp0 + and $acc0,$mask,$acc0 + add $tmp0,$sbit,$sbit add $acc0,$car1,$car1 add $acc1,$car1,$car1 st $car1,[$tp] srlx $car1,32,$car1 add $car0,$car0,$car0 ! recover $car0 - or $sbit,$car0,$car0 + add $sbit,$car0,$car0 add $car0,$car1,$car1 add $car2,$car1,$car1 st $car1,[$tp+4]