PPC assembly pack: jumbo update from master.
[openssl.git] / crypto / sha / asm / sha1-ppc.pl
index dcd0fcdfcfa20d8c6d32f9604fd3d5b7d6f9d68b..24a5d065d9fdaac77006afc97a6f64d5e14d81fc 100755 (executable)
@@ -9,8 +9,7 @@
 
 # I let hardware handle unaligned input(*), except on page boundaries
 # (see below for details). Otherwise straightforward implementation
-# with X vector in register bank. The module is big-endian [which is
-# not big deal as there're no little-endian targets left around].
+# with X vector in register bank.
 #
 # (*) this means that this module is inappropriate for PPC403? Does
 #     anybody know if pre-POWER3 can sustain unaligned load?
@@ -24,18 +23,24 @@ $flavour = shift;
 
 if ($flavour =~ /64/) {
        $SIZE_T =8;
+       $LRSAVE =2*$SIZE_T;
        $UCMP   ="cmpld";
        $STU    ="stdu";
        $POP    ="ld";
        $PUSH   ="std";
 } elsif ($flavour =~ /32/) {
        $SIZE_T =4;
+       $LRSAVE =$SIZE_T;
        $UCMP   ="cmplw";
        $STU    ="stwu";
        $POP    ="lwz";
        $PUSH   ="stw";
 } else { die "nonsense $flavour"; }
 
+# Define endianness based on flavour,
+# e.g. linux64le
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
+
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -43,7 +48,8 @@ die "can't locate ppc-xlate.pl";
 
 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
 
-$FRAME=24*$SIZE_T;
+$FRAME=24*$SIZE_T+64;
+$LOCALS=6*$SIZE_T;
 
 $K  ="r0";
 $sp ="r1";
@@ -65,14 +71,28 @@ $T  ="r12";
 @X=("r16","r17","r18","r19","r20","r21","r22","r23",
     "r24","r25","r26","r27","r28","r29","r30","r31");
 
+sub loadbe {
+my ($dst, $src, $temp_reg) = @_;
+$code.=<<___ if (!$LITTLE_ENDIAN);
+       lwz     $dst,$src
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+       lwz     $temp_reg,$src
+       rotlwi  $dst,$temp_reg,8
+       rlwimi  $dst,$temp_reg,24,0,7
+       rlwimi  $dst,$temp_reg,24,16,23
+___
+}
+
 sub BODY_00_19 {
 my ($i,$a,$b,$c,$d,$e,$f)=@_;
 my $j=$i+1;
-$code.=<<___ if ($i==0);
-       lwz     @X[$i],`$i*4`($inp)
-___
+
+       # Since the last value of $f is discarded, we can use
+       # it as a temp reg to swap byte-order when needed.
+       loadbe("@X[$i]","`$i*4`($inp)",$f) if ($i==0);
+       loadbe("@X[$j]","`$j*4`($inp)",$f) if ($i<15);
 $code.=<<___ if ($i<15);
-       lwz     @X[$j],`$j*4`($inp)
        add     $f,$K,$e
        rotlwi  $e,$a,5
        add     $f,$f,@X[$i]
@@ -162,9 +182,8 @@ $code=<<___;
 .globl .sha1_block_data_order
 .align 4
 .sha1_block_data_order:
+       $STU    $sp,-$FRAME($sp)
        mflr    r0
-       $STU    $sp,`-($FRAME+64)`($sp)
-       $PUSH   r0,`$FRAME-$SIZE_T*18`($sp)
        $PUSH   r15,`$FRAME-$SIZE_T*17`($sp)
        $PUSH   r16,`$FRAME-$SIZE_T*16`($sp)
        $PUSH   r17,`$FRAME-$SIZE_T*15`($sp)
@@ -182,6 +201,7 @@ $code=<<___;
        $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
        $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
        $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
+       $PUSH   r0,`$FRAME+$LRSAVE`($sp)
        lwz     $A,0($ctx)
        lwz     $B,4($ctx)
        lwz     $C,8($ctx)
@@ -192,37 +212,14 @@ $code=<<___;
 Laligned:
        mtctr   $num
        bl      Lsha1_block_private
-Ldone:
-       $POP    r0,`$FRAME-$SIZE_T*18`($sp)
-       $POP    r15,`$FRAME-$SIZE_T*17`($sp)
-       $POP    r16,`$FRAME-$SIZE_T*16`($sp)
-       $POP    r17,`$FRAME-$SIZE_T*15`($sp)
-       $POP    r18,`$FRAME-$SIZE_T*14`($sp)
-       $POP    r19,`$FRAME-$SIZE_T*13`($sp)
-       $POP    r20,`$FRAME-$SIZE_T*12`($sp)
-       $POP    r21,`$FRAME-$SIZE_T*11`($sp)
-       $POP    r22,`$FRAME-$SIZE_T*10`($sp)
-       $POP    r23,`$FRAME-$SIZE_T*9`($sp)
-       $POP    r24,`$FRAME-$SIZE_T*8`($sp)
-       $POP    r25,`$FRAME-$SIZE_T*7`($sp)
-       $POP    r26,`$FRAME-$SIZE_T*6`($sp)
-       $POP    r27,`$FRAME-$SIZE_T*5`($sp)
-       $POP    r28,`$FRAME-$SIZE_T*4`($sp)
-       $POP    r29,`$FRAME-$SIZE_T*3`($sp)
-       $POP    r30,`$FRAME-$SIZE_T*2`($sp)
-       $POP    r31,`$FRAME-$SIZE_T*1`($sp)
-       mtlr    r0
-       addi    $sp,$sp,`$FRAME+64`
-       blr
-___
+       b       Ldone
 
-# PowerPC specification allows an implementation to be ill-behaved
-# upon unaligned access which crosses page boundary. "Better safe
-# than sorry" principle makes me treat it specially. But I don't
-# look for particular offending word, but rather for 64-byte input
-# block which crosses the boundary. Once found that block is aligned
-# and hashed separately...
-$code.=<<___;
+; PowerPC specification allows an implementation to be ill-behaved
+; upon unaligned access which crosses page boundary. "Better safe
+; than sorry" principle makes me treat it specially. But I don't
+; look for particular offending word, but rather for 64-byte input
+; block which crosses the boundary. Once found that block is aligned
+; and hashed separately...
 .align 4
 Lunaligned:
        subfic  $t1,$inp,4096
@@ -237,7 +234,7 @@ Lunaligned:
 Lcross_page:
        li      $t1,16
        mtctr   $t1
-       addi    r20,$sp,$FRAME  ; spot below the frame
+       addi    r20,$sp,$LOCALS ; spot within the frame
 Lmemcpy:
        lbz     r16,0($inp)
        lbz     r17,1($inp)
@@ -251,15 +248,40 @@ Lmemcpy:
        addi    r20,r20,4
        bdnz    Lmemcpy
 
-       $PUSH   $inp,`$FRAME-$SIZE_T*19`($sp)
+       $PUSH   $inp,`$FRAME-$SIZE_T*18`($sp)
        li      $t1,1
-       addi    $inp,$sp,$FRAME
+       addi    $inp,$sp,$LOCALS
        mtctr   $t1
        bl      Lsha1_block_private
-       $POP    $inp,`$FRAME-$SIZE_T*19`($sp)
+       $POP    $inp,`$FRAME-$SIZE_T*18`($sp)
        addic.  $num,$num,-1
        bne-    Lunaligned
-       b       Ldone
+
+Ldone:
+       $POP    r0,`$FRAME+$LRSAVE`($sp)
+       $POP    r15,`$FRAME-$SIZE_T*17`($sp)
+       $POP    r16,`$FRAME-$SIZE_T*16`($sp)
+       $POP    r17,`$FRAME-$SIZE_T*15`($sp)
+       $POP    r18,`$FRAME-$SIZE_T*14`($sp)
+       $POP    r19,`$FRAME-$SIZE_T*13`($sp)
+       $POP    r20,`$FRAME-$SIZE_T*12`($sp)
+       $POP    r21,`$FRAME-$SIZE_T*11`($sp)
+       $POP    r22,`$FRAME-$SIZE_T*10`($sp)
+       $POP    r23,`$FRAME-$SIZE_T*9`($sp)
+       $POP    r24,`$FRAME-$SIZE_T*8`($sp)
+       $POP    r25,`$FRAME-$SIZE_T*7`($sp)
+       $POP    r26,`$FRAME-$SIZE_T*6`($sp)
+       $POP    r27,`$FRAME-$SIZE_T*5`($sp)
+       $POP    r28,`$FRAME-$SIZE_T*4`($sp)
+       $POP    r29,`$FRAME-$SIZE_T*3`($sp)
+       $POP    r30,`$FRAME-$SIZE_T*2`($sp)
+       $POP    r31,`$FRAME-$SIZE_T*1`($sp)
+       mtlr    r0
+       addi    $sp,$sp,$FRAME
+       blr
+       .long   0
+       .byte   0,12,4,1,0x80,18,3,0
+       .long   0
 ___
 
 # This is private block function, which uses tailored calling
@@ -309,6 +331,9 @@ $code.=<<___;
        addi    $inp,$inp,`16*4`
        bdnz-   Lsha1_block_private
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
+.size  .sha1_block_data_order,.-.sha1_block_data_order
 ___
 $code.=<<___;
 .asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"