bn/asm/ppc-mont.pl: prepare for extension.

[openssl.git] / crypto / bn / asm / ppc-mont.pl
diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl

index b69809a97e81067b2bdbe839ede100662936d6b3..8676567cc2595697ca06b87061fa0ab1c29ea488 100644 (file)
--- a/crypto/bn/asm/ppc-mont.pl
+++ b/crypto/bn/asm/ppc-mont.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
  
  # ====================================================================
  # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@@ -19,19 +26,18 @@
  # So far RSA *sign* performance improvement over pre-bn_mul_mont asm
  # for 64-bit application running on PPC970/G5 is:
  #
-# 512-bit      +65%    
+# 512-bit      +65%
  # 1024-bit     +35%
  # 2048-bit     +18%
  # 4096-bit     +4%
  
-$output = shift;
+$flavour = shift;
  
-if ($output =~ /32\-mont\.s/) {
+if ($flavour =~ /32/) {
         $BITS=  32;
         $BNSZ=  $BITS/8;
         $SIZE_T=4;
         $RZONE= 224;
-       $FRAME= $SIZE_T*16;
  
         $LD=    "lwz";          # load
         $LDU=   "lwzu";         # load and update
@@ -43,15 +49,14 @@ if ($output =~ /32\-mont\.s/) {
         $UMULL= "mullw";        # unsigned multiply low
         $UMULH= "mulhwu";       # unsigned multiply high
         $UCMP=  "cmplw";        # unsigned compare
-       $SHRI=  "srwi";         # unsigned shift right by immediate     
+       $SHRI=  "srwi";         # unsigned shift right by immediate
         $PUSH=  $ST;
         $POP=   $LD;
-} elsif ($output =~ /64\-mont\.s/) {
+} elsif ($flavour =~ /64/) {
         $BITS=  64;
         $BNSZ=  $BITS/8;
         $SIZE_T=8;
         $RZONE= 288;
-       $FRAME= $SIZE_T*16;
  
         # same as above, but 64-bit mnemonics...
         $LD=    "ld";           # load
@@ -64,82 +69,98 @@ if ($output =~ /32\-mont\.s/) {
         $UMULL= "mulld";        # unsigned multiply low
         $UMULH= "mulhdu";       # unsigned multiply high
         $UCMP=  "cmpld";        # unsigned compare
-       $SHRI=  "srdi";         # unsigned shift right by immediate     
+       $SHRI=  "srdi";         # unsigned shift right by immediate
         $PUSH=  $ST;
         $POP=   $LD;
-} else { die "nonsense $output"; }
+} else { die "nonsense $flavour"; }
+
+$FRAME=8*$SIZE_T+$RZONE;
+$LOCALS=8*$SIZE_T;
  
-( defined shift || open STDOUT,"| $^X ../perlasm/ppc-xlate.pl $output" ) ||
-       die "can't call ../perlasm/ppc-xlate.pl: $!";
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";	# 'or', not '||', so the die guards open() itself rather than the always-true command string
  
  $sp="r1";
  $toc="r2";
-$rp="r3";      $ovf="r3";
+$rp="r3";
  $ap="r4";
  $bp="r5";
  $np="r6";
  $n0="r7";
  $num="r8";
-$rp="r9";      # $rp is reassigned
-$aj="r10";
-$nj="r11";
-$tj="r12";
+
+{
+my $ovf=$rp;
+my $rp="r9";   # $rp is reassigned
+my $aj="r10";
+my $nj="r11";
+my $tj="r12";
  # non-volatile registers
-$i="r14";
-$j="r15";
-$tp="r16";
-$m0="r17";
-$m1="r18";
-$lo0="r19";
-$hi0="r20";
-$lo1="r21";
-$hi1="r22";
-$alo="r23";
-$ahi="r24";
-$nlo="r25";
+my $i="r20";
+my $j="r21";
+my $tp="r22";
+my $m0="r23";
+my $m1="r24";
+my $lo0="r25";
+my $hi0="r26";
+my $lo1="r27";
+my $hi1="r28";
+my $alo="r29";
+my $ahi="r30";
+my $nlo="r31";
  #
-$nhi="r0";
+my $nhi="r0";
  
  $code=<<___;
  .machine "any"
  .text
  
-.globl .bn_mul_mont
+.globl .bn_mul_mont_int
  .align 4
-.bn_mul_mont:
+.bn_mul_mont_int:
         cmpwi   $num,4
         mr      $rp,r3          ; $rp is reassigned
         li      r3,0
         bltlr
-
+___
+$code.=<<___ if ($BNSZ==4);
+       cmpwi   $num,32         ; longer key performance is not better
+       bgelr
+___
+$code.=<<___;
         slwi    $num,$num,`log($BNSZ)/log(2)`
         li      $tj,-4096
-       addi    $ovf,$num,`$FRAME+$RZONE`
+       addi    $ovf,$num,$FRAME
         subf    $ovf,$ovf,$sp   ; $sp-$ovf
         and     $ovf,$ovf,$tj   ; minimize TLB usage
         subf    $ovf,$sp,$ovf   ; $ovf-$sp
+       mr      $tj,$sp
         srwi    $num,$num,`log($BNSZ)/log(2)`
         $STUX   $sp,$sp,$ovf
  
-       $PUSH   r14,`4*$SIZE_T`($sp)
-       $PUSH   r15,`5*$SIZE_T`($sp)
-       $PUSH   r16,`6*$SIZE_T`($sp)
-       $PUSH   r17,`7*$SIZE_T`($sp)
-       $PUSH   r18,`8*$SIZE_T`($sp)
-       $PUSH   r19,`9*$SIZE_T`($sp)
-       $PUSH   r20,`10*$SIZE_T`($sp)
-       $PUSH   r21,`11*$SIZE_T`($sp)
-       $PUSH   r22,`12*$SIZE_T`($sp)
-       $PUSH   r23,`13*$SIZE_T`($sp)
-       $PUSH   r24,`14*$SIZE_T`($sp)
-       $PUSH   r25,`15*$SIZE_T`($sp)
+       $PUSH   r20,`-12*$SIZE_T`($tj)
+       $PUSH   r21,`-11*$SIZE_T`($tj)
+       $PUSH   r22,`-10*$SIZE_T`($tj)
+       $PUSH   r23,`-9*$SIZE_T`($tj)
+       $PUSH   r24,`-8*$SIZE_T`($tj)
+       $PUSH   r25,`-7*$SIZE_T`($tj)
+       $PUSH   r26,`-6*$SIZE_T`($tj)
+       $PUSH   r27,`-5*$SIZE_T`($tj)
+       $PUSH   r28,`-4*$SIZE_T`($tj)
+       $PUSH   r29,`-3*$SIZE_T`($tj)
+       $PUSH   r30,`-2*$SIZE_T`($tj)
+       $PUSH   r31,`-1*$SIZE_T`($tj)
  
         $LD     $n0,0($n0)      ; pull n0[0] value
         addi    $num,$num,-2    ; adjust $num for counter register
  \f
         $LD     $m0,0($bp)      ; m0=bp[0]
         $LD     $aj,0($ap)      ; ap[0]
-       addi    $tp,$sp,$FRAME
+       addi    $tp,$sp,$LOCALS
         $UMULL  $lo0,$aj,$m0    ; ap[0]*bp[0]
         $UMULH  $hi0,$aj,$m0
  
@@ -180,7 +201,7 @@ L1st:
  
         addi    $j,$j,$BNSZ     ; j++
         addi    $tp,$tp,$BNSZ   ; tp++
-       bdnz-   L1st
+       bdnz    L1st
  ;L1st
         addc    $lo0,$alo,$hi0
         addze   $hi0,$ahi
@@ -201,8 +222,8 @@ L1st:
  Louter:
         $LDX    $m0,$bp,$i      ; m0=bp[i]
         $LD     $aj,0($ap)      ; ap[0]
-       addi    $tp,$sp,$FRAME
-       $LD     $tj,$FRAME($sp) ; tp[0]
+       addi    $tp,$sp,$LOCALS
+       $LD     $tj,$LOCALS($sp); tp[0]
         $UMULL  $lo0,$aj,$m0    ; ap[0]*bp[i]
         $UMULH  $hi0,$aj,$m0
         $LD     $aj,$BNSZ($ap)  ; ap[1]
@@ -242,7 +263,7 @@ Linner:
         addze   $hi1,$hi1
         $ST     $lo1,0($tp)     ; tp[j-1]
         addi    $tp,$tp,$BNSZ   ; tp++
-       bdnz-   Linner
+       bdnz    Linner
  ;Linner
         $LD     $tj,$BNSZ($tp)  ; tp[j]
         addc    $lo0,$alo,$hi0
@@ -265,15 +286,12 @@ Linner:
         slwi    $tj,$num,`log($BNSZ)/log(2)`
         $UCMP   $i,$tj
         addi    $i,$i,$BNSZ
-       ble-    Louter
+       ble     Louter
  \f
-       $SHRI.  $nj,$nj,$BITS-2 ; check boundary condition
         addi    $num,$num,2     ; restore $num
         subfc   $j,$j,$j        ; j=0 and "clear" XER[CA]
-       addi    $tp,$sp,$FRAME
-       addi    $ap,$sp,$FRAME
+       addi    $tp,$sp,$LOCALS
         mtctr   $num
-       beq     Lcopy           ; boundary condition is met
  
  .align 4
  Lsub:  $LDX    $tj,$tp,$j
@@ -281,7 +299,7 @@ Lsub:       $LDX    $tj,$tp,$j
         subfe   $aj,$nj,$tj     ; tp[j]-np[j]
         $STX    $aj,$rp,$j
         addi    $j,$j,$BNSZ
-       bdnz-   Lsub
+       bdnz    Lsub
  
         li      $j,0
         mtctr   $num
@@ -296,25 +314,32 @@ Lcopy:                            ; copy or in-place refresh
         $STX    $tj,$rp,$j
         $STX    $j,$tp,$j       ; zap at once
         addi    $j,$j,$BNSZ
-       bdnz-   Lcopy
-
-       $POP    r14,`4*$SIZE_T`($sp)
-       $POP    r15,`5*$SIZE_T`($sp)
-       $POP    r16,`6*$SIZE_T`($sp)
-       $POP    r17,`7*$SIZE_T`($sp)
-       $POP    r18,`8*$SIZE_T`($sp)
-       $POP    r19,`9*$SIZE_T`($sp)
-       $POP    r20,`10*$SIZE_T`($sp)
-       $POP    r21,`11*$SIZE_T`($sp)
-       $POP    r22,`12*$SIZE_T`($sp)
-       $POP    r23,`13*$SIZE_T`($sp)
-       $POP    r24,`14*$SIZE_T`($sp)
-       $POP    r25,`15*$SIZE_T`($sp)
-       $POP    $sp,0($sp)
+       bdnz    Lcopy
+
+       $POP    $tj,0($sp)
         li      r3,1
+       $POP    r20,`-12*$SIZE_T`($tj)
+       $POP    r21,`-11*$SIZE_T`($tj)
+       $POP    r22,`-10*$SIZE_T`($tj)
+       $POP    r23,`-9*$SIZE_T`($tj)
+       $POP    r24,`-8*$SIZE_T`($tj)
+       $POP    r25,`-7*$SIZE_T`($tj)
+       $POP    r26,`-6*$SIZE_T`($tj)
+       $POP    r27,`-5*$SIZE_T`($tj)
+       $POP    r28,`-4*$SIZE_T`($tj)
+       $POP    r29,`-3*$SIZE_T`($tj)
+       $POP    r30,`-2*$SIZE_T`($tj)
+       $POP    r31,`-1*$SIZE_T`($tj)
+       mr      $sp,$tj
         blr
         .long   0
-.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+       .byte   0,12,4,0,0x80,12,6,0
+       .long   0
+.size  .bn_mul_mont_int,.-.bn_mul_mont_int
+___
+}
+$code.=<<___;
+.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
  ___
  
  $code =~ s/\`([^\`]*)\`/eval $1/gem;