Following the license change, modify the boilerplates in crypto/bn/

[openssl.git] / crypto / bn / asm / vis3-mont.pl
diff --git a/crypto/bn/asm/vis3-mont.pl b/crypto/bn/asm/vis3-mont.pl

index a1357de0e99af4b55a2d26213de4c7164c0960ba..9e8f498f97be5729b9c062133a73fbf45f883bf4 100644 (file)
--- a/crypto/bn/asm/vis3-mont.pl
+++ b/crypto/bn/asm/vis3-mont.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2012-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
  
  # ====================================================================
  # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -9,7 +16,7 @@
  
  # October 2012.
  #
-# SPARCv9 VIS3 Montgomery multiplicaion procedure suitable for T3 and
+# SPARCv9 VIS3 Montgomery multiplication procedure suitable for T3 and
  # onward. There are three new instructions used here: umulxhi,
  # addxc[cc] and initializing store. On T3 RSA private key operations
  # are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
@@ -18,16 +25,20 @@
  # for reference purposes, because T4 has dedicated Montgomery
  # multiplication and squaring *instructions* that deliver even more.
  
-$bits=32;
-for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-if ($bits==64)  { $bias=2047; $frame=192; }
-else            { $bias=0;    $frame=112; }
+$output = pop;
+open STDOUT,">$output";
+
+$frame = "STACK_FRAME";
+$bias = "STACK_BIAS";
  
-$code.=<<___ if ($bits==64);
+$code.=<<___;
+#include "sparc_arch.h"
+
+#ifdef __arch64__
  .register      %g2,#scratch
  .register      %g3,#scratch
-___
-$code.=<<___;
+#endif
+
  .section       ".text",#alloc,#execinstr
  ___
  
@@ -100,7 +111,7 @@ $code.=<<___;
         ld      [$ap+12],       $t3
         or      $t0,    $aj,    $aj
         add     $ap,    16,     $ap
-       stxa    $aj,    [$anp]0xe2      ! converted ap[0]
+       stx     $aj,    [$anp]          ! converted ap[0]
  
         mulx    $aj,    $m0,    $lo0    ! ap[0]*bp[0]
         umulxhi $aj,    $m0,    $hi0
@@ -150,7 +161,7 @@ $code.=<<___;
         sllx    $t1,    32,     $aj
         add     $ap,    8,      $ap
         or      $t0,    $aj,    $aj
-       stxa    $aj,    [$anp]0xe2      ! converted ap[j]
+       stx     $aj,    [$anp]          ! converted ap[j]
  
         ld      [$np+0],        $t2     ! np[j]
         addcc   $nlo,   $hi1,   $lo1
@@ -169,7 +180,7 @@ $code.=<<___;
         addcc   $lo0,   $lo1,   $lo1    ! np[j]*m1+ap[j]*bp[0]
         umulxhi $nj,    $m1,    $nj     ! nhi=nj
         addxc   %g0,    $hi1,   $hi1
-       stxa    $lo1,   [$tp]0xe2       ! tp[j-1]
+       stx     $lo1,   [$tp]           ! tp[j-1]
         add     $tp,    8,      $tp     ! tp++
  
         brnz,pt $cnt,   .L1st
@@ -182,12 +193,12 @@ $code.=<<___;
         addxc   $nj,    %g0,    $hi1
         addcc   $lo0,   $lo1,   $lo1    ! np[j]*m1+ap[j]*bp[0]
         addxc   %g0,    $hi1,   $hi1
-       stxa    $lo1,   [$tp]0xe2       ! tp[j-1]
+       stx     $lo1,   [$tp]           ! tp[j-1]
         add     $tp,    8,      $tp
  
         addcc   $hi0,   $hi1,   $hi1
         addxc   %g0,    %g0,    $ovf    ! upmost overflow bit
-       stxa    $hi1,   [$tp]0xe2
+       stx     $hi1,   [$tp]
         add     $tp,    8,      $tp
  \f
         ba      .Louter
@@ -299,23 +310,23 @@ $code.=<<___;
         sub     $anp,   $num,   $anp
         sub     $rp,    $num,   $rp
  
-       subc    $ovf,   %g0,    $ovf    ! handle upmost overflow bit
-       and     $tp,    $ovf,   $ap
-       andn    $rp,    $ovf,   $np
-       or      $np,    $ap,    $ap     ! ap=borrow?tp:rp
+       subccc  $ovf,   %g0,    $ovf    ! handle upmost overflow bit
         ba      .Lcopy
         sub     $num,   8,      $cnt
  
  .align 16
-.Lcopy:                                        ! copy or in-place refresh
-       ld      [$ap+0],        $t2
-       ld      [$ap+4],        $t3
-       add     $ap,    8,      $ap
+.Lcopy:                                        ! conditional copy
+       ld      [$tp+0],        $t0
+       ld      [$tp+4],        $t1
+       ld      [$rp+0],        $t2
+       ld      [$rp+4],        $t3
         stx     %g0,    [$tp]           ! zap
         add     $tp,    8,      $tp
         stx     %g0,    [$anp]          ! zap
         stx     %g0,    [$anp+8]
         add     $anp,   16,     $anp
+       movcs   %icc,   $t0,    $t2
+       movcs   %icc,   $t1,    $t3
         st      $t3,    [$rp+0]         ! flip order
         st      $t2,    [$rp+4]
         add     $rp,    8,      $rp
@@ -333,7 +344,7 @@ ___
  \f
  # Purpose of these subroutines is to explicitly encode VIS instructions,
  # so that one can compile the module without having to specify VIS
-# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
  # Idea is to reserve for option to produce "universal" binary and let
  # programmer detect if current CPU is VIS capable at run-time.
  sub unvis3 {