PPC assembly pack: make new .size directives profiler-friendly.
[openssl.git] / crypto / sha / asm / sha1-ppc.pl
index 2c84d59..8aa5a37 100755 (executable)
 # PPC970,gcc-4.0.0     +76%    +59%
 # Power6,xlc-7         +68%    +33%
 
-$output = shift;
+$flavour = shift;
 
-if ($output =~ /64\.s/) {
+if ($flavour =~ /64/) {
        $SIZE_T =8;
+       $LRSAVE =2*$SIZE_T;
        $UCMP   ="cmpld";
        $STU    ="stdu";
        $POP    ="ld";
        $PUSH   ="std";
-} elsif ($output =~ /32\.s/) {
+} elsif ($flavour =~ /32/) {
        $SIZE_T =4;
+       $LRSAVE =$SIZE_T;
        $UCMP   ="cmplw";
        $STU    ="stwu";
        $POP    ="lwz";
        $PUSH   ="stw";
-} else { die "nonsense $output"; }
+} else { die "nonsense $flavour"; }
 
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
 die "can't locate ppc-xlate.pl";
 
-( defined shift || open STDOUT,"| $^X $xlate $output" ) ||
-       die "can't call $xlate: $!";
+open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!"; # parens + 'or': with '||' the die was bound to the (always true) command string, not to open()
 
-$FRAME=24*$SIZE_T;
+$FRAME=24*$SIZE_T+64;
+$LOCALS=6*$SIZE_T;
 
 $K  ="r0";
 $sp ="r1";
@@ -157,14 +159,14 @@ ___
 }
 
 $code=<<___;
+.machine       "any"
 .text
 
 .globl .sha1_block_data_order
 .align 4
 .sha1_block_data_order:
+       $STU    $sp,-$FRAME($sp)
        mflr    r0
-       $STU    $sp,`-($FRAME+64)`($sp)
-       $PUSH   r0,`$FRAME-$SIZE_T*18`($sp)
        $PUSH   r15,`$FRAME-$SIZE_T*17`($sp)
        $PUSH   r16,`$FRAME-$SIZE_T*16`($sp)
        $PUSH   r17,`$FRAME-$SIZE_T*15`($sp)
@@ -182,6 +184,7 @@ $code=<<___;
        $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
        $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
        $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
+       $PUSH   r0,`$FRAME+$LRSAVE`($sp)
        lwz     $A,0($ctx)
        lwz     $B,4($ctx)
        lwz     $C,8($ctx)
@@ -192,37 +195,14 @@ $code=<<___;
 Laligned:
        mtctr   $num
        bl      Lsha1_block_private
-Ldone:
-       $POP    r0,`$FRAME-$SIZE_T*18`($sp)
-       $POP    r15,`$FRAME-$SIZE_T*17`($sp)
-       $POP    r16,`$FRAME-$SIZE_T*16`($sp)
-       $POP    r17,`$FRAME-$SIZE_T*15`($sp)
-       $POP    r18,`$FRAME-$SIZE_T*14`($sp)
-       $POP    r19,`$FRAME-$SIZE_T*13`($sp)
-       $POP    r20,`$FRAME-$SIZE_T*12`($sp)
-       $POP    r21,`$FRAME-$SIZE_T*11`($sp)
-       $POP    r22,`$FRAME-$SIZE_T*10`($sp)
-       $POP    r23,`$FRAME-$SIZE_T*9`($sp)
-       $POP    r24,`$FRAME-$SIZE_T*8`($sp)
-       $POP    r25,`$FRAME-$SIZE_T*7`($sp)
-       $POP    r26,`$FRAME-$SIZE_T*6`($sp)
-       $POP    r27,`$FRAME-$SIZE_T*5`($sp)
-       $POP    r28,`$FRAME-$SIZE_T*4`($sp)
-       $POP    r29,`$FRAME-$SIZE_T*3`($sp)
-       $POP    r30,`$FRAME-$SIZE_T*2`($sp)
-       $POP    r31,`$FRAME-$SIZE_T*1`($sp)
-       mtlr    r0
-       addi    $sp,$sp,`$FRAME+64`
-       blr
-___
+       b       Ldone
 
-# PowerPC specification allows an implementation to be ill-behaved
-# upon unaligned access which crosses page boundary. "Better safe
-# than sorry" principle makes me treat it specially. But I don't
-# look for particular offending word, but rather for 64-byte input
-# block which crosses the boundary. Once found that block is aligned
-# and hashed separately...
-$code.=<<___;
+; The PowerPC specification allows an implementation to be ill-behaved
+; upon an unaligned access which crosses a page boundary. The "better
+; safe than sorry" principle makes me treat it specially. I don't look
+; for the particular offending word, but rather for a 64-byte input
+; block which crosses the boundary. Once found, that block is aligned
+; and hashed separately...
 .align 4
 Lunaligned:
        subfic  $t1,$inp,4096
@@ -237,7 +217,7 @@ Lunaligned:
 Lcross_page:
        li      $t1,16
        mtctr   $t1
-       addi    r20,$sp,$FRAME  ; spot below the frame
+       addi    r20,$sp,$LOCALS ; spot within the frame
 Lmemcpy:
        lbz     r16,0($inp)
        lbz     r17,1($inp)
@@ -251,15 +231,40 @@ Lmemcpy:
        addi    r20,r20,4
        bdnz    Lmemcpy
 
-       $PUSH   $inp,`$FRAME-$SIZE_T*19`($sp)
+       $PUSH   $inp,`$FRAME-$SIZE_T*18`($sp)
        li      $t1,1
-       addi    $inp,$sp,$FRAME
+       addi    $inp,$sp,$LOCALS
        mtctr   $t1
        bl      Lsha1_block_private
-       $POP    $inp,`$FRAME-$SIZE_T*19`($sp)
+       $POP    $inp,`$FRAME-$SIZE_T*18`($sp)
        addic.  $num,$num,-1
        bne-    Lunaligned
-       b       Ldone
+
+Ldone:
+       $POP    r0,`$FRAME+$LRSAVE`($sp)
+       $POP    r15,`$FRAME-$SIZE_T*17`($sp)
+       $POP    r16,`$FRAME-$SIZE_T*16`($sp)
+       $POP    r17,`$FRAME-$SIZE_T*15`($sp)
+       $POP    r18,`$FRAME-$SIZE_T*14`($sp)
+       $POP    r19,`$FRAME-$SIZE_T*13`($sp)
+       $POP    r20,`$FRAME-$SIZE_T*12`($sp)
+       $POP    r21,`$FRAME-$SIZE_T*11`($sp)
+       $POP    r22,`$FRAME-$SIZE_T*10`($sp)
+       $POP    r23,`$FRAME-$SIZE_T*9`($sp)
+       $POP    r24,`$FRAME-$SIZE_T*8`($sp)
+       $POP    r25,`$FRAME-$SIZE_T*7`($sp)
+       $POP    r26,`$FRAME-$SIZE_T*6`($sp)
+       $POP    r27,`$FRAME-$SIZE_T*5`($sp)
+       $POP    r28,`$FRAME-$SIZE_T*4`($sp)
+       $POP    r29,`$FRAME-$SIZE_T*3`($sp)
+       $POP    r30,`$FRAME-$SIZE_T*2`($sp)
+       $POP    r31,`$FRAME-$SIZE_T*1`($sp)
+       mtlr    r0
+       addi    $sp,$sp,$FRAME
+       blr
+       .long   0
+       .byte   0,12,4,1,0x80,18,3,0
+       .long   0
 ___
 
 # This is private block function, which uses tailored calling
@@ -309,6 +314,9 @@ $code.=<<___;
        addi    $inp,$inp,`16*4`
        bdnz-   Lsha1_block_private
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
+.size  .sha1_block_data_order,.-.sha1_block_data_order
 ___
 $code.=<<___;
 .asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"