sha/asm/sha1-ppc.pl: add little-endian support.
[openssl.git] / crypto / sha / asm / sha1-ppc.pl
index c1bf1f4872b67c378b6b931f9ac7d796b2bd2ea8..ac7e90d1b51753c4bd1b40a5315d634a9a01cf76 100755 (executable)
@@ -2,41 +2,59 @@
 
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 
-# I let hardware handle unaligned input, except on page boundaries
+# I let hardware handle unaligned input(*), except on page boundaries
 # (see below for details). Otherwise straightforward implementation
 # with X vector in register bank. The module is big-endian [which is
 # not big deal as there're no little-endian targets left around].
+#
+# (*) this means that this module is inappropriate for PPC403? Does
+#     anybody know if pre-POWER3 can sustain unaligned load?
 
-# gcc-4.0.0    -m64    -m32
-# --------------------------
-# sha1         +76%    +59%
+#                      -m64    -m32
+# ----------------------------------
+# PPC970,gcc-4.0.0     +76%    +59%
+# Power6,xlc-7         +68%    +33%
 
-$output = shift;
+$flavour = shift;
 
-if ($output =~ /64\.s/) {
+if ($flavour =~ /64/) {
        $SIZE_T =8;
-       $RZONE  =288;
+       $LRSAVE =2*$SIZE_T;
        $UCMP   ="cmpld";
        $STU    ="stdu";
        $POP    ="ld";
        $PUSH   ="std";
-} elsif ($output =~ /32\.s/) {
+} elsif ($flavour =~ /32/) {
        $SIZE_T =4;
-       $RZONE  =224;
+       $LRSAVE =$SIZE_T;
        $UCMP   ="cmplw";
        $STU    ="stwu";
        $POP    ="lwz";
        $PUSH   ="stw";
-} else { die "nonsense $output"; }
+} else { die "nonsense $flavour"; }
 
-( defined shift || open STDOUT,"| $^X ../perlasm/ppc-xlate.pl $output" ) ||
-       die "can't call ../perlasm/ppc-xlate.pl: $!";
+# Define endianness based on flavour,
+# e.g. linux64le
+$LITTLE_ENDIAN=0;
+if ($flavour =~ /le$/) {
+       die "little-endian is 64-bit only: $flavour" if ($SIZE_T == 4);
+       $LITTLE_ENDIAN=1;
+}
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
 
-$FRAME=24*$SIZE_T;
+# Pipe generated code through ppc-xlate.pl; 'or' (not '||', which binds to shift) so die fires when open fails.
+open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
+$FRAME=24*$SIZE_T+64;
+$LOCALS=6*$SIZE_T;
 
 $K  ="r0";
 $sp ="r1";
@@ -58,14 +76,28 @@ $T  ="r12";
 @X=("r16","r17","r18","r19","r20","r21","r22","r23",
     "r24","r25","r26","r27","r28","r29","r30","r31");
 
+sub loadbe {   # append to $code a 32-bit load of $src into $dst, byte-swapped when $LITTLE_ENDIAN is set
+my ($dst, $src, $temp_reg) = @_;   # $temp_reg is only used (and overwritten) in the little-endian sequence
+$code.=<<___ if (!$LITTLE_ENDIAN);   # big-endian: a single lwz straight into $dst
+       lwz     $dst,$src
+___
+$code.=<<___ if ($LITTLE_ENDIAN);   # little-endian: lwz into $temp_reg, then rotlwi/rlwimi reverse the byte order into $dst
+       lwz     $temp_reg,$src
+       rotlwi  $dst,$temp_reg,8
+       rlwimi  $dst,$temp_reg,24,0,7
+       rlwimi  $dst,$temp_reg,24,16,23
+___
+}
+
 sub BODY_00_19 {
 my ($i,$a,$b,$c,$d,$e,$f)=@_;
 my $j=$i+1;
-$code.=<<___ if ($i==0);
-       lwz     @X[$i],`$i*4`($inp)
-___
+
+       # Since the last value of $f is discarded, we can use
+       # it as a temp reg to swap byte-order when needed.
+       loadbe("@X[$i]","`$i*4`($inp)",$f) if ($i==0);
+       loadbe("@X[$j]","`$j*4`($inp)",$f) if ($i<15);
 $code.=<<___ if ($i<15);
-       lwz     @X[$j],`$j*4`($inp)
        add     $f,$K,$e
        rotlwi  $e,$a,5
        add     $f,$f,@X[$i]
@@ -149,15 +181,14 @@ ___
 }
 
 $code=<<___;
-.machine any
+.machine       "any"
 .text
 
-.globl .sha1_block_asm_data_order
+.globl .sha1_block_data_order
 .align 4
-.sha1_block_asm_data_order:
+.sha1_block_data_order:
+       $STU    $sp,-$FRAME($sp)
        mflr    r0
-       $STU    $sp,`-($FRAME+64+$RZONE)`($sp)
-       $PUSH   r0,`$FRAME-$SIZE_T*18`($sp)
        $PUSH   r15,`$FRAME-$SIZE_T*17`($sp)
        $PUSH   r16,`$FRAME-$SIZE_T*16`($sp)
        $PUSH   r17,`$FRAME-$SIZE_T*15`($sp)
@@ -175,6 +206,7 @@ $code=<<___;
        $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
        $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
        $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
+       $PUSH   r0,`$FRAME+$LRSAVE`($sp)
        lwz     $A,0($ctx)
        lwz     $B,4($ctx)
        lwz     $C,8($ctx)
@@ -185,53 +217,29 @@ $code=<<___;
 Laligned:
        mtctr   $num
        bl      Lsha1_block_private
-Ldone:
-       $POP    r0,`$FRAME-$SIZE_T*18`($sp)
-       $POP    r15,`$FRAME-$SIZE_T*17`($sp)
-       $POP    r16,`$FRAME-$SIZE_T*16`($sp)
-       $POP    r17,`$FRAME-$SIZE_T*15`($sp)
-       $POP    r18,`$FRAME-$SIZE_T*14`($sp)
-       $POP    r19,`$FRAME-$SIZE_T*13`($sp)
-       $POP    r20,`$FRAME-$SIZE_T*12`($sp)
-       $POP    r21,`$FRAME-$SIZE_T*11`($sp)
-       $POP    r22,`$FRAME-$SIZE_T*10`($sp)
-       $POP    r23,`$FRAME-$SIZE_T*9`($sp)
-       $POP    r24,`$FRAME-$SIZE_T*8`($sp)
-       $POP    r25,`$FRAME-$SIZE_T*7`($sp)
-       $POP    r26,`$FRAME-$SIZE_T*6`($sp)
-       $POP    r27,`$FRAME-$SIZE_T*5`($sp)
-       $POP    r28,`$FRAME-$SIZE_T*4`($sp)
-       $POP    r29,`$FRAME-$SIZE_T*3`($sp)
-       $POP    r30,`$FRAME-$SIZE_T*2`($sp)
-       $POP    r31,`$FRAME-$SIZE_T*1`($sp)
-       mtlr    r0
-       addi    $sp,$sp,`$FRAME+64+$RZONE`
-       blr
-___
+       b       Ldone
 
-# PowerPC specification allows an implementation to be ill-behaved
-# upon unaligned access which crosses page boundary. "Better safe
-# than sorry" principle makes me treat it specially. But I don't
-# look for particular offending word, but rather for 64-byte input
-# block which crosses the boundary. Once found that block is aligned
-# and hashed separately...
-$code.=<<___;
+; PowerPC specification allows an implementation to be ill-behaved
+; upon unaligned access which crosses page boundary. "Better safe
+; than sorry" principle makes me treat it specially. But I don't
+; look for particular offending word, but rather for 64-byte input
+; block which crosses the boundary. Once found that block is aligned
+; and hashed separately...
 .align 4
 Lunaligned:
-       li      $t1,4096
-       subf    $t1,$inp,$t1
+       subfic  $t1,$inp,4096
        andi.   $t1,$t1,4095    ; distance to closest page boundary
        srwi.   $t1,$t1,6       ; t1/=64
        beq     Lcross_page
        $UCMP   $num,$t1
        ble-    Laligned        ; didn't cross the page boundary
        mtctr   $t1
-       subf    $num,$t1,$num
+       subfc   $num,$t1,$num
        bl      Lsha1_block_private
 Lcross_page:
        li      $t1,16
        mtctr   $t1
-       addi    r20,$sp,$FRAME  ; spot below the frame
+       addi    r20,$sp,$LOCALS ; spot within the frame
 Lmemcpy:
        lbz     r16,0($inp)
        lbz     r17,1($inp)
@@ -245,15 +253,40 @@ Lmemcpy:
        addi    r20,r20,4
        bdnz    Lmemcpy
 
-       $PUSH   $inp,`$FRAME-$SIZE_T*19`($sp)
+       $PUSH   $inp,`$FRAME-$SIZE_T*18`($sp)
        li      $t1,1
-       addi    $inp,$sp,$FRAME
+       addi    $inp,$sp,$LOCALS
        mtctr   $t1
        bl      Lsha1_block_private
-       $POP    $inp,`$FRAME-$SIZE_T*19`($sp)
+       $POP    $inp,`$FRAME-$SIZE_T*18`($sp)
        addic.  $num,$num,-1
        bne-    Lunaligned
-       b       Ldone
+
+Ldone:
+       $POP    r0,`$FRAME+$LRSAVE`($sp)
+       $POP    r15,`$FRAME-$SIZE_T*17`($sp)
+       $POP    r16,`$FRAME-$SIZE_T*16`($sp)
+       $POP    r17,`$FRAME-$SIZE_T*15`($sp)
+       $POP    r18,`$FRAME-$SIZE_T*14`($sp)
+       $POP    r19,`$FRAME-$SIZE_T*13`($sp)
+       $POP    r20,`$FRAME-$SIZE_T*12`($sp)
+       $POP    r21,`$FRAME-$SIZE_T*11`($sp)
+       $POP    r22,`$FRAME-$SIZE_T*10`($sp)
+       $POP    r23,`$FRAME-$SIZE_T*9`($sp)
+       $POP    r24,`$FRAME-$SIZE_T*8`($sp)
+       $POP    r25,`$FRAME-$SIZE_T*7`($sp)
+       $POP    r26,`$FRAME-$SIZE_T*6`($sp)
+       $POP    r27,`$FRAME-$SIZE_T*5`($sp)
+       $POP    r28,`$FRAME-$SIZE_T*4`($sp)
+       $POP    r29,`$FRAME-$SIZE_T*3`($sp)
+       $POP    r30,`$FRAME-$SIZE_T*2`($sp)
+       $POP    r31,`$FRAME-$SIZE_T*1`($sp)
+       mtlr    r0
+       addi    $sp,$sp,$FRAME
+       blr
+       .long   0
+       .byte   0,12,4,1,0x80,18,3,0
+       .long   0
 ___
 
 # This is private block function, which uses tailored calling
@@ -303,6 +336,12 @@ $code.=<<___;
        addi    $inp,$inp,`16*4`
        bdnz-   Lsha1_block_private
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
+.size  .sha1_block_data_order,.-.sha1_block_data_order
+___
+$code.=<<___;
+.asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;