Split bignum code out of the sparcv9cap.c

[openssl.git] / crypto / bn / asm / sparcv9-mont.pl
diff --git a/crypto/bn/asm/sparcv9-mont.pl b/crypto/bn/asm/sparcv9-mont.pl

index 0339bfe7f3a82a7c298c26de495bd0e923994e14..ac93ef9ee64205a8e5b11a07f2f90bc5ec722fd4 100644 (file)
--- a/crypto/bn/asm/sparcv9-mont.pl
+++ b/crypto/bn/asm/sparcv9-mont.pl
@@ -1,9 +1,17 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
  
  # ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
  # ====================================================================
  
  # December 2005
@@ -12,7 +20,7 @@
  # for undertaken effort are multiple. First of all, UltraSPARC is not
  # the whole SPARCv9 universe and other VIS-free implementations deserve
  # optimized code as much. Secondly, newly introduced UltraSPARC T1,
-# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive pathes,
+# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
  # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
  # several integrated RSA/DSA accelerator circuits accessible through
  # kernel driver [only(*)], but having decent user-land software
@@ -22,7 +30,7 @@
  # instructions...
  
  # (*)  Engine accessing the driver in question is on my TODO list.
-#      For reference, acceleator is estimated to give 6 to 10 times
+#      For reference, accelerator is estimated to give 6 to 10 times
  #      improvement on single-threaded RSA sign. It should be noted
  #      that 6-10x improvement coefficient does not actually mean
  #      something extraordinary in terms of absolute [single-threaded]
@@ -41,6 +49,8 @@
  # module still have hidden potential [see TODO list there], which is
  # estimated to be larger than 20%...
  
+$output = pop and open STDOUT,">$output";
+
  # int bn_mul_mont(
  $rp="%i0";     # BN_ULONG *rp,
  $ap="%i1";     # const BN_ULONG *ap,
@@ -49,10 +59,8 @@ $np="%i3";   # const BN_ULONG *np,
  $n0="%i4";     # const BN_ULONG *n0,
  $num="%i5";    # int num);
  
-$bits=32;
-for (@ARGV)    { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-if ($bits==64) { $bias=2047; $frame=192; }
-else           { $bias=0;    $frame=128; }
+$frame="STACK_FRAME";
+$bias="STACK_BIAS";
  
  $car0="%o0";
  $car1="%o1";
@@ -72,9 +80,14 @@ $apj="%l5";
  $npj="%l6";
  $tpj="%l7";
  
-$fname="bn_mul_mont";
+$fname="bn_mul_mont_int";
  
  $code=<<___;
+#ifndef __ASSEMBLER__
+# define __ASSEMBLER__ 1
+#endif
+#include "crypto/sparc_arch.h"
+
  .section       ".text",#alloc,#execinstr
  
  .global        $fname
@@ -94,17 +107,17 @@ $fname:
         cmp     $ap,$bp
         and     $num,$mask,$num
         ld      [$bp],$mul0             ! bp[0]
-       be,pt   `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
         nop
  
         add     %sp,$bias,%o7           ! real top of stack
-       ld      [$ap],$car0             ! ap[0]
+       ld      [$ap],$car0             ! ap[0] ! redundant in squaring context
         sub     %o7,$num,%o7
         ld      [$ap+4],$apj            ! ap[1]
         and     %o7,-1024,%o7
         ld      [$np],$car1             ! np[0]
         sub     %o7,$bias,%sp           ! alloca
         ld      [$np+4],$npj            ! np[1]
+       be,pt   SIZE_T_CC,.Lbn_sqr_mont
         mov     12,$j
  
         mulx    $car0,$mul0,$car0       ! ap[0]*bp[0]
@@ -254,66 +267,44 @@ $fname:
  .Ltail:
         add     $np,$num,$np
         add     $rp,$num,$rp
-
-       cmp     $car2,0                 ! clears %icc.c
-       bne,pn  %icc,.Lsub
         sub     %g0,$num,%o7            ! k=-num
-
-       cmp     $car1,$npj              ! compare top-most $tp and $np words
-       bcs,pt  %icc,.Lcopy             ! %icc.c is clean if not taken
-       nop
-
-.align 16,0x1000000
+       ba      .Lsub
+       subcc   %g0,%g0,%g0             ! clear %icc.c
+.align 16
  .Lsub:
         ld      [$tp+%o7],%o0
         ld      [$np+%o7],%o1
-       subccc  %o0,%o1,%o1
-       st      %o1,[$rp+%o7]
+       subccc  %o0,%o1,%o1             ! tp[j]-np[j]
+       add     $rp,%o7,$i
         add     %o7,4,%o7
         brnz    %o7,.Lsub
-       nop
-       subccc  $car2,0,$car2
-       bcc     %icc,.Lzap
+       st      %o1,[$i]
+       subccc  $car2,0,$car2           ! handle upmost overflow bit
         sub     %g0,$num,%o7
  
-.align 16,0x1000000
  .Lcopy:
-       ld      [$tp+%o7],%o0
+       ld      [$tp+%o7],%o1           ! conditional copy
+       ld      [$rp+%o7],%o0
+       st      %g0,[$tp+%o7]           ! zap tp
+       movcs   %icc,%o1,%o0
         st      %o0,[$rp+%o7]
         add     %o7,4,%o7
         brnz    %o7,.Lcopy
         nop
-       ba      .Lzap
-       sub     %g0,$num,%o7
-
-.align 32
-.Lzap:
-       st      %g0,[$tp+%o7]
-       add     %o7,4,%o7
-       brnz    %o7,.Lzap
-       nop
         mov     1,%i0
         ret
         restore
  ___
  \f
  ########
-######## bn_sqr_mont gives up to 20% improvement over above code
+######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
+######## code without the following dedicated squaring procedure.
  ########
-$sbit="%i2";           # re-use $bp!
+$sbit="%o5";
  
  $code.=<<___;
  .align 32
  .Lbn_sqr_mont:
-       add     %sp,$bias,%o7                   ! real top of stack
-       ld      [$ap+4],$apj                    ! ap[1]
-       sub     %o7,$num,%o7
-       ld      [$np],$car1                     ! np[0]
-       and     %o7,-1024,%o7
-       ld      [$np+4],$npj                    ! np[1]
-       sub     %o7,$bias,%sp                   ! alloca
-       mov     12,$j
-
         mulx    $mul0,$mul0,$car0               ! ap[0]*ap[0]
         mulx    $apj,$mul0,$tmp0                !prologue!
         and     $car0,$mask,$acc0
@@ -422,7 +413,7 @@ $code.=<<___;
         mulx    $apj,$mul0,$acc0
         mulx    $npj,$mul1,$acc1
         add     $acc0,$car0,$car0
-       add     $tpj,$car1,$car1
+       add     $tpj,$sbit,$sbit
         ld      [$ap+$j],$apj                   ! ap[j]
         and     $car0,$mask,$acc0
         ld      [$np+$j],$npj                   ! np[j]
@@ -431,7 +422,7 @@ $code.=<<___;
         ld      [$tp+8],$tpj                    ! tp[j]
         add     $acc0,$acc0,$acc0
         add     $j,4,$j                         ! j++
-       or      $sbit,$acc0,$acc0
+       add     $sbit,$acc0,$acc0
         srlx    $acc0,32,$sbit
         and     $acc0,$mask,$acc0
         cmp     $j,$num
@@ -445,12 +436,12 @@ $code.=<<___;
         mulx    $apj,$mul0,$acc0
         mulx    $npj,$mul1,$acc1
         add     $acc0,$car0,$car0
-       add     $tpj,$car1,$car1
+       add     $tpj,$sbit,$sbit
         and     $car0,$mask,$acc0
         srlx    $car0,32,$car0
         add     $acc1,$car1,$car1
         add     $acc0,$acc0,$acc0
-       or      $sbit,$acc0,$acc0
+       add     $sbit,$acc0,$acc0
         srlx    $acc0,32,$sbit
         and     $acc0,$mask,$acc0
         add     $acc0,$car1,$car1
@@ -458,7 +449,7 @@ $code.=<<___;
         srlx    $car1,32,$car1
  
         add     $car0,$car0,$car0
-       or      $sbit,$car0,$car0
+       add     $sbit,$car0,$car0
         add     $car0,$car1,$car1
         add     $car2,$car1,$car1
         st      $car1,[$tp+4]
@@ -504,6 +495,9 @@ $code.=<<___;
         mulx    $npj,$mul1,$acc1
         add     $tpj,$car1,$car1
         ld      [$np+$j],$npj                   ! np[j]
+       srlx    $car1,32,$tmp0
+       and     $car1,$mask,$car1
+       add     $tmp0,$sbit,$sbit
         add     $acc0,$car1,$car1
         ld      [$tp+8],$tpj                    ! tp[j]
         add     $acc1,$car1,$car1
@@ -518,7 +512,7 @@ $code.=<<___;
  .Lsqr_inner2:
         mulx    $apj,$mul0,$acc0
         mulx    $npj,$mul1,$acc1
-       add     $tpj,$car1,$car1
+       add     $tpj,$sbit,$sbit
         add     $acc0,$car0,$car0
         ld      [$ap+$j],$apj                   ! ap[j]
         and     $car0,$mask,$acc0
@@ -526,7 +520,7 @@ $code.=<<___;
         srlx    $car0,32,$car0
         add     $acc0,$acc0,$acc0
         ld      [$tp+8],$tpj                    ! tp[j]
-       or      $sbit,$acc0,$acc0
+       add     $sbit,$acc0,$acc0
         add     $j,4,$j                         ! j++
         srlx    $acc0,32,$sbit
         and     $acc0,$mask,$acc0
@@ -541,12 +535,12 @@ $code.=<<___;
  .Lsqr_no_inner2:
         mulx    $apj,$mul0,$acc0
         mulx    $npj,$mul1,$acc1
-       add     $tpj,$car1,$car1
+       add     $tpj,$sbit,$sbit
         add     $acc0,$car0,$car0
         and     $car0,$mask,$acc0
         srlx    $car0,32,$car0
         add     $acc0,$acc0,$acc0
-       or      $sbit,$acc0,$acc0
+       add     $sbit,$acc0,$acc0
         srlx    $acc0,32,$sbit
         and     $acc0,$mask,$acc0
         add     $acc0,$car1,$car1
@@ -555,7 +549,7 @@ $code.=<<___;
         srlx    $car1,32,$car1
  
         add     $car0,$car0,$car0
-       or      $sbit,$car0,$car0
+       add     $sbit,$car0,$car0
         add     $car0,$car1,$car1
         add     $car2,$car1,$car1
         st      $car1,[$tp+4]
@@ -600,14 +594,17 @@ $code.=<<___;
  !.Lsqr_last
  
         mulx    $npj,$mul1,$acc1
-       add     $tpj,$car1,$car1
+       add     $tpj,$acc0,$acc0
+       srlx    $acc0,32,$tmp0
+       and     $acc0,$mask,$acc0
+       add     $tmp0,$sbit,$sbit
         add     $acc0,$car1,$car1
         add     $acc1,$car1,$car1
         st      $car1,[$tp]
         srlx    $car1,32,$car1
  
         add     $car0,$car0,$car0               ! recover $car0
-       or      $sbit,$car0,$car0
+       add     $sbit,$car0,$car0
         add     $car0,$car1,$car1
         add     $car2,$car1,$car1
         st      $car1,[$tp+4]
@@ -617,7 +614,9 @@ $code.=<<___;
         add     $tp,8,$tp
  .type  $fname,#function
  .size  $fname,(.-$fname)
+.asciz "Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align 32
  ___
  $code =~ s/\`([^\`]*)\`/eval($1)/gem;
  print $code;
-close STDOUT;
+close STDOUT or die "error closing STDOUT: $!";