PPC assembly pack: jumbo update from master.

[openssl.git] / crypto / sha / asm / sha1-ppc.pl
diff --git a/crypto/sha/asm/sha1-ppc.pl b/crypto/sha/asm/sha1-ppc.pl

index dcd0fcdfcfa20d8c6d32f9604fd3d5b7d6f9d68b..24a5d065d9fdaac77006afc97a6f64d5e14d81fc 100755 (executable)
--- a/crypto/sha/asm/sha1-ppc.pl
+++ b/crypto/sha/asm/sha1-ppc.pl
@@ -9,8 +9,7 @@
  
  # I let hardware handle unaligned input(*), except on page boundaries
  # (see below for details). Otherwise straightforward implementation
-# with X vector in register bank. The module is big-endian [which is
-# not big deal as there're no little-endian targets left around].
+# with X vector in register bank.
  #
  # (*) this means that this module is inappropriate for PPC403? Does
  #     anybody know if pre-POWER3 can sustain unaligned load?
@@ -24,18 +23,24 @@ $flavour = shift;
  
  if ($flavour =~ /64/) {
         $SIZE_T =8;
+       $LRSAVE =2*$SIZE_T;
         $UCMP   ="cmpld";
         $STU    ="stdu";
         $POP    ="ld";
         $PUSH   ="std";
  } elsif ($flavour =~ /32/) {
         $SIZE_T =4;
+       $LRSAVE =$SIZE_T;
         $UCMP   ="cmplw";
         $STU    ="stwu";
         $POP    ="lwz";
         $PUSH   ="stw";
  } else { die "nonsense $flavour"; }
  
+# Define endianness based on flavour,
+# e.g. linux64le
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
+
  $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -43,7 +48,8 @@ die "can't locate ppc-xlate.pl";
  
  open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
  
-$FRAME=24*$SIZE_T;
+$FRAME=24*$SIZE_T+64;
+$LOCALS=6*$SIZE_T;
  
  $K  ="r0";
  $sp ="r1";
@@ -65,14 +71,28 @@ $T  ="r12";
  @X=("r16","r17","r18","r19","r20","r21","r22","r23",
      "r24","r25","r26","r27","r28","r29","r30","r31");
  
+sub loadbe {   # Appends to $code a 32-bit load of $src into $dst; little-endian flavours also get a byte-order swap.
+my ($dst, $src, $temp_reg) = @_;       # $temp_reg is only written by the little-endian sequence (holds the raw loaded word).
+$code.=<<___ if (!$LITTLE_ENDIAN);
+       lwz     $dst,$src
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+       lwz     $temp_reg,$src
+       rotlwi  $dst,$temp_reg,8
+       rlwimi  $dst,$temp_reg,24,0,7
+       rlwimi  $dst,$temp_reg,24,16,23
+___
+}
+
  sub BODY_00_19 {
  my ($i,$a,$b,$c,$d,$e,$f)=@_;
  my $j=$i+1;
-$code.=<<___ if ($i==0);
-       lwz     @X[$i],`$i*4`($inp)
-___
+
+       # Since the last value of $f is discarded, we can use
+       # it as a temp reg to swap byte-order when needed.
+       loadbe("@X[$i]","`$i*4`($inp)",$f) if ($i==0);
+       loadbe("@X[$j]","`$j*4`($inp)",$f) if ($i<15);
  $code.=<<___ if ($i<15);
-       lwz     @X[$j],`$j*4`($inp)
         add     $f,$K,$e
         rotlwi  $e,$a,5
         add     $f,$f,@X[$i]
@@ -162,9 +182,8 @@ $code=<<___;
  .globl .sha1_block_data_order
  .align 4
  .sha1_block_data_order:
+       $STU    $sp,-$FRAME($sp)
         mflr    r0
-       $STU    $sp,`-($FRAME+64)`($sp)
-       $PUSH   r0,`$FRAME-$SIZE_T*18`($sp)
         $PUSH   r15,`$FRAME-$SIZE_T*17`($sp)
         $PUSH   r16,`$FRAME-$SIZE_T*16`($sp)
         $PUSH   r17,`$FRAME-$SIZE_T*15`($sp)
@@ -182,6 +201,7 @@ $code=<<___;
         $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
         $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
         $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
+       $PUSH   r0,`$FRAME+$LRSAVE`($sp)
         lwz     $A,0($ctx)
         lwz     $B,4($ctx)
         lwz     $C,8($ctx)
@@ -192,37 +212,14 @@ $code=<<___;
  Laligned:
         mtctr   $num
         bl      Lsha1_block_private
-Ldone:
-       $POP    r0,`$FRAME-$SIZE_T*18`($sp)
-       $POP    r15,`$FRAME-$SIZE_T*17`($sp)
-       $POP    r16,`$FRAME-$SIZE_T*16`($sp)
-       $POP    r17,`$FRAME-$SIZE_T*15`($sp)
-       $POP    r18,`$FRAME-$SIZE_T*14`($sp)
-       $POP    r19,`$FRAME-$SIZE_T*13`($sp)
-       $POP    r20,`$FRAME-$SIZE_T*12`($sp)
-       $POP    r21,`$FRAME-$SIZE_T*11`($sp)
-       $POP    r22,`$FRAME-$SIZE_T*10`($sp)
-       $POP    r23,`$FRAME-$SIZE_T*9`($sp)
-       $POP    r24,`$FRAME-$SIZE_T*8`($sp)
-       $POP    r25,`$FRAME-$SIZE_T*7`($sp)
-       $POP    r26,`$FRAME-$SIZE_T*6`($sp)
-       $POP    r27,`$FRAME-$SIZE_T*5`($sp)
-       $POP    r28,`$FRAME-$SIZE_T*4`($sp)
-       $POP    r29,`$FRAME-$SIZE_T*3`($sp)
-       $POP    r30,`$FRAME-$SIZE_T*2`($sp)
-       $POP    r31,`$FRAME-$SIZE_T*1`($sp)
-       mtlr    r0
-       addi    $sp,$sp,`$FRAME+64`
-       blr
-___
+       b       Ldone
  
-# PowerPC specification allows an implementation to be ill-behaved
-# upon unaligned access which crosses page boundary. "Better safe
-# than sorry" principle makes me treat it specially. But I don't
-# look for particular offending word, but rather for 64-byte input
-# block which crosses the boundary. Once found that block is aligned
-# and hashed separately...
-$code.=<<___;
+; PowerPC specification allows an implementation to be ill-behaved
+; upon unaligned access which crosses page boundary. "Better safe
+; than sorry" principle makes me treat it specially. But I don't
+; look for particular offending word, but rather for 64-byte input
+; block which crosses the boundary. Once found that block is aligned
+; and hashed separately...
  .align 4
  Lunaligned:
         subfic  $t1,$inp,4096
@@ -237,7 +234,7 @@ Lunaligned:
  Lcross_page:
         li      $t1,16
         mtctr   $t1
-       addi    r20,$sp,$FRAME  ; spot below the frame
+       addi    r20,$sp,$LOCALS ; spot within the frame
  Lmemcpy:
         lbz     r16,0($inp)
         lbz     r17,1($inp)
@@ -251,15 +248,40 @@ Lmemcpy:
         addi    r20,r20,4
         bdnz    Lmemcpy
  
-       $PUSH   $inp,`$FRAME-$SIZE_T*19`($sp)
+       $PUSH   $inp,`$FRAME-$SIZE_T*18`($sp)
         li      $t1,1
-       addi    $inp,$sp,$FRAME
+       addi    $inp,$sp,$LOCALS
         mtctr   $t1
         bl      Lsha1_block_private
-       $POP    $inp,`$FRAME-$SIZE_T*19`($sp)
+       $POP    $inp,`$FRAME-$SIZE_T*18`($sp)
         addic.  $num,$num,-1
         bne-    Lunaligned
-       b       Ldone
+
+Ldone:
+       $POP    r0,`$FRAME+$LRSAVE`($sp)
+       $POP    r15,`$FRAME-$SIZE_T*17`($sp)
+       $POP    r16,`$FRAME-$SIZE_T*16`($sp)
+       $POP    r17,`$FRAME-$SIZE_T*15`($sp)
+       $POP    r18,`$FRAME-$SIZE_T*14`($sp)
+       $POP    r19,`$FRAME-$SIZE_T*13`($sp)
+       $POP    r20,`$FRAME-$SIZE_T*12`($sp)
+       $POP    r21,`$FRAME-$SIZE_T*11`($sp)
+       $POP    r22,`$FRAME-$SIZE_T*10`($sp)
+       $POP    r23,`$FRAME-$SIZE_T*9`($sp)
+       $POP    r24,`$FRAME-$SIZE_T*8`($sp)
+       $POP    r25,`$FRAME-$SIZE_T*7`($sp)
+       $POP    r26,`$FRAME-$SIZE_T*6`($sp)
+       $POP    r27,`$FRAME-$SIZE_T*5`($sp)
+       $POP    r28,`$FRAME-$SIZE_T*4`($sp)
+       $POP    r29,`$FRAME-$SIZE_T*3`($sp)
+       $POP    r30,`$FRAME-$SIZE_T*2`($sp)
+       $POP    r31,`$FRAME-$SIZE_T*1`($sp)
+       mtlr    r0
+       addi    $sp,$sp,$FRAME
+       blr
+       .long   0
+       .byte   0,12,4,1,0x80,18,3,0
+       .long   0
  ___
  
  # This is private block function, which uses tailored calling
@@ -309,6 +331,9 @@ $code.=<<___;
         addi    $inp,$inp,`16*4`
         bdnz-   Lsha1_block_private
         blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
+.size  .sha1_block_data_order,.-.sha1_block_data_order
  ___
  $code.=<<___;
  .asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"