PPC assembler pack: adhere closer to ABI specs, add PowerOpen traceback data.
author Andy Polyakov <appro@openssl.org>
Fri, 27 May 2011 13:32:34 +0000 (13:32 +0000)
committer Andy Polyakov <appro@openssl.org>
Fri, 27 May 2011 13:32:34 +0000 (13:32 +0000)
crypto/aes/asm/aes-ppc.pl
crypto/bn/asm/ppc-mont.pl
crypto/bn/asm/ppc.pl
crypto/bn/asm/ppc64-mont.pl
crypto/ppccpuid.pl
crypto/sha/asm/sha1-ppc.pl
crypto/sha/asm/sha512-ppc.pl

index f82c5e1..8cfd423 100644 (file)
@@ -18,7 +18,7 @@
 
 # February 2010
 #
-# Rescheduling instructions to favour Power6 pipeline gives 10%
+# Rescheduling instructions to favour Power6 pipeline gave 10%
 # performance improvement on the platfrom in question (and marginal
 # improvement even on others). It should be noted that Power6 fails
 # to process byte in 18 cycles, only in 23, because it fails to issue
@@ -33,11 +33,13 @@ $flavour = shift;
 
 if ($flavour =~ /64/) {
        $SIZE_T =8;
+       $LRSAVE =2*$SIZE_T;
        $STU    ="stdu";
        $POP    ="ld";
        $PUSH   ="std";
 } elsif ($flavour =~ /32/) {
        $SIZE_T =4;
+       $LRSAVE =$SIZE_T;
        $STU    ="stwu";
        $POP    ="lwz";
        $PUSH   ="stw";
@@ -116,15 +118,19 @@ LAES_Te:
        addi    $Tbl0,$Tbl0,`128-8`
        mtlr    r0
        blr
-       .space  `32-24`
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
+       .space  `64-9*4`
 LAES_Td:
        mflr    r0
        bcl     20,31,\$+4
        mflr    $Tbl0   ;    vvvvvvvv "distance" between . and 1st data entry
-       addi    $Tbl0,$Tbl0,`128-8-32+2048+256`
+       addi    $Tbl0,$Tbl0,`128-64-8+2048+256`
        mtlr    r0
        blr
-       .space  `128-32-24`
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
+       .space  `128-64-9*4`
 ___
 &_data_word(
        0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
@@ -328,10 +334,9 @@ $code.=<<___;
 .globl .AES_encrypt
 .align 7
 .AES_encrypt:
-       mflr    r0
        $STU    $sp,-$FRAME($sp)
+       mflr    r0
 
-       $PUSH   r0,`$FRAME-$SIZE_T*21`($sp)
        $PUSH   $toc,`$FRAME-$SIZE_T*20`($sp)
        $PUSH   r13,`$FRAME-$SIZE_T*19`($sp)
        $PUSH   r14,`$FRAME-$SIZE_T*18`($sp)
@@ -352,6 +357,7 @@ $code.=<<___;
        $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
        $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
        $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
+       $PUSH   r0,`$FRAME+$LRSAVE`($sp)
 
        lwz     $s0,0($inp)
        lwz     $s1,4($inp)
@@ -364,7 +370,7 @@ $code.=<<___;
        stw     $s2,8($out)
        stw     $s3,12($out)
 
-       $POP    r0,`$FRAME-$SIZE_T*21`($sp)
+       $POP    r0,`$FRAME+$LRSAVE`($sp)
        $POP    $toc,`$FRAME-$SIZE_T*20`($sp)
        $POP    r13,`$FRAME-$SIZE_T*19`($sp)
        $POP    r14,`$FRAME-$SIZE_T*18`($sp)
@@ -388,6 +394,9 @@ $code.=<<___;
        mtlr    r0
        addi    $sp,$sp,$FRAME
        blr
+       .long   0
+       .byte   0,12,4,1,0x80,18,3,0
+       .long   0
 
 .align 5
 Lppc_AES_encrypt:
@@ -530,6 +539,8 @@ Lenc_loop:
        xor     $s2,$s2,$t2
        xor     $s3,$s3,$t3
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
 
 .align 4
 Lppc_AES_encrypt_compact:
@@ -673,14 +684,15 @@ Lenc_compact_done:
        xor     $s2,$s2,$t2
        xor     $s3,$s3,$t3
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
 
 .globl .AES_decrypt
 .align 7
 .AES_decrypt:
-       mflr    r0
        $STU    $sp,-$FRAME($sp)
+       mflr    r0
 
-       $PUSH   r0,`$FRAME-$SIZE_T*21`($sp)
        $PUSH   $toc,`$FRAME-$SIZE_T*20`($sp)
        $PUSH   r13,`$FRAME-$SIZE_T*19`($sp)
        $PUSH   r14,`$FRAME-$SIZE_T*18`($sp)
@@ -701,6 +713,7 @@ Lenc_compact_done:
        $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
        $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
        $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
+       $PUSH   r0,`$FRAME+$LRSAVE`($sp)
 
        lwz     $s0,0($inp)
        lwz     $s1,4($inp)
@@ -713,7 +726,7 @@ Lenc_compact_done:
        stw     $s2,8($out)
        stw     $s3,12($out)
 
-       $POP    r0,`$FRAME-$SIZE_T*21`($sp)
+       $POP    r0,`$FRAME+$LRSAVE`($sp)
        $POP    $toc,`$FRAME-$SIZE_T*20`($sp)
        $POP    r13,`$FRAME-$SIZE_T*19`($sp)
        $POP    r14,`$FRAME-$SIZE_T*18`($sp)
@@ -737,6 +750,9 @@ Lenc_compact_done:
        mtlr    r0
        addi    $sp,$sp,$FRAME
        blr
+       .long   0
+       .byte   0,12,4,1,0x80,18,3,0
+       .long   0
 
 .align 5
 Lppc_AES_decrypt:
@@ -879,6 +895,8 @@ Ldec_loop:
        xor     $s2,$s2,$t2
        xor     $s3,$s3,$t3
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
 
 .align 4
 Lppc_AES_decrypt_compact:
@@ -1179,7 +1197,9 @@ Ldec_compact_done:
        xor     $s2,$s2,$t2
        xor     $s3,$s3,$t3
        blr
-.long  0
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
+
 .asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
 .align 7
 ___
index 9257b2c..f9b6992 100644 (file)
@@ -31,7 +31,6 @@ if ($flavour =~ /32/) {
        $BNSZ=  $BITS/8;
        $SIZE_T=4;
        $RZONE= 224;
-       $FRAME= $SIZE_T*16;
 
        $LD=    "lwz";          # load
        $LDU=   "lwzu";         # load and update
@@ -51,7 +50,6 @@ if ($flavour =~ /32/) {
        $BNSZ=  $BITS/8;
        $SIZE_T=8;
        $RZONE= 288;
-       $FRAME= $SIZE_T*16;
 
        # same as above, but 64-bit mnemonics...
        $LD=    "ld";           # load
@@ -69,6 +67,9 @@ if ($flavour =~ /32/) {
        $POP=   $LD;
 } else { die "nonsense $flavour"; }
 
+$FRAME=8*$SIZE_T+$RZONE;
+$LOCALS=8*$SIZE_T;
+
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -89,18 +90,18 @@ $aj="r10";
 $nj="r11";
 $tj="r12";
 # non-volatile registers
-$i="r14";
-$j="r15";
-$tp="r16";
-$m0="r17";
-$m1="r18";
-$lo0="r19";
-$hi0="r20";
-$lo1="r21";
-$hi1="r22";
-$alo="r23";
-$ahi="r24";
-$nlo="r25";
+$i="r20";
+$j="r21";
+$tp="r22";
+$m0="r23";
+$m1="r24";
+$lo0="r25";
+$hi0="r26";
+$lo1="r27";
+$hi1="r28";
+$alo="r29";
+$ahi="r30";
+$nlo="r31";
 #
 $nhi="r0";
 
@@ -123,32 +124,33 @@ ___
 $code.=<<___;
        slwi    $num,$num,`log($BNSZ)/log(2)`
        li      $tj,-4096
-       addi    $ovf,$num,`$FRAME+$RZONE`
+       addi    $ovf,$num,$FRAME
        subf    $ovf,$ovf,$sp   ; $sp-$ovf
        and     $ovf,$ovf,$tj   ; minimize TLB usage
        subf    $ovf,$sp,$ovf   ; $ovf-$sp
+       mr      $tj,$sp
        srwi    $num,$num,`log($BNSZ)/log(2)`
        $STUX   $sp,$sp,$ovf
 
-       $PUSH   r14,`4*$SIZE_T`($sp)
-       $PUSH   r15,`5*$SIZE_T`($sp)
-       $PUSH   r16,`6*$SIZE_T`($sp)
-       $PUSH   r17,`7*$SIZE_T`($sp)
-       $PUSH   r18,`8*$SIZE_T`($sp)
-       $PUSH   r19,`9*$SIZE_T`($sp)
-       $PUSH   r20,`10*$SIZE_T`($sp)
-       $PUSH   r21,`11*$SIZE_T`($sp)
-       $PUSH   r22,`12*$SIZE_T`($sp)
-       $PUSH   r23,`13*$SIZE_T`($sp)
-       $PUSH   r24,`14*$SIZE_T`($sp)
-       $PUSH   r25,`15*$SIZE_T`($sp)
+       $PUSH   r20,`-12*$SIZE_T`($tj)
+       $PUSH   r21,`-11*$SIZE_T`($tj)
+       $PUSH   r22,`-10*$SIZE_T`($tj)
+       $PUSH   r23,`-9*$SIZE_T`($tj)
+       $PUSH   r24,`-8*$SIZE_T`($tj)
+       $PUSH   r25,`-7*$SIZE_T`($tj)
+       $PUSH   r26,`-6*$SIZE_T`($tj)
+       $PUSH   r27,`-5*$SIZE_T`($tj)
+       $PUSH   r28,`-4*$SIZE_T`($tj)
+       $PUSH   r29,`-3*$SIZE_T`($tj)
+       $PUSH   r30,`-2*$SIZE_T`($tj)
+       $PUSH   r31,`-1*$SIZE_T`($tj)
 
        $LD     $n0,0($n0)      ; pull n0[0] value
        addi    $num,$num,-2    ; adjust $num for counter register
 \f
        $LD     $m0,0($bp)      ; m0=bp[0]
        $LD     $aj,0($ap)      ; ap[0]
-       addi    $tp,$sp,$FRAME
+       addi    $tp,$sp,$LOCALS
        $UMULL  $lo0,$aj,$m0    ; ap[0]*bp[0]
        $UMULH  $hi0,$aj,$m0
 
@@ -210,8 +212,8 @@ L1st:
 Louter:
        $LDX    $m0,$bp,$i      ; m0=bp[i]
        $LD     $aj,0($ap)      ; ap[0]
-       addi    $tp,$sp,$FRAME
-       $LD     $tj,$FRAME($sp) ; tp[0]
+       addi    $tp,$sp,$LOCALS
+       $LD     $tj,$LOCALS($sp); tp[0]
        $UMULL  $lo0,$aj,$m0    ; ap[0]*bp[i]
        $UMULH  $hi0,$aj,$m0
        $LD     $aj,$BNSZ($ap)  ; ap[1]
@@ -278,7 +280,7 @@ Linner:
 \f
        addi    $num,$num,2     ; restore $num
        subfc   $j,$j,$j        ; j=0 and "clear" XER[CA]
-       addi    $tp,$sp,$FRAME
+       addi    $tp,$sp,$LOCALS
        mtctr   $num
 
 .align 4
@@ -304,23 +306,27 @@ Lcopy:                            ; copy or in-place refresh
        addi    $j,$j,$BNSZ
        bdnz-   Lcopy
 
-       $POP    r14,`4*$SIZE_T`($sp)
-       $POP    r15,`5*$SIZE_T`($sp)
-       $POP    r16,`6*$SIZE_T`($sp)
-       $POP    r17,`7*$SIZE_T`($sp)
-       $POP    r18,`8*$SIZE_T`($sp)
-       $POP    r19,`9*$SIZE_T`($sp)
-       $POP    r20,`10*$SIZE_T`($sp)
-       $POP    r21,`11*$SIZE_T`($sp)
-       $POP    r22,`12*$SIZE_T`($sp)
-       $POP    r23,`13*$SIZE_T`($sp)
-       $POP    r24,`14*$SIZE_T`($sp)
-       $POP    r25,`15*$SIZE_T`($sp)
-       $POP    $sp,0($sp)
+       $POP    $tj,0($sp)
        li      r3,1
+       $POP    r20,`-12*$SIZE_T`($tj)
+       $POP    r21,`-11*$SIZE_T`($tj)
+       $POP    r22,`-10*$SIZE_T`($tj)
+       $POP    r23,`-9*$SIZE_T`($tj)
+       $POP    r24,`-8*$SIZE_T`($tj)
+       $POP    r25,`-7*$SIZE_T`($tj)
+       $POP    r26,`-6*$SIZE_T`($tj)
+       $POP    r27,`-5*$SIZE_T`($tj)
+       $POP    r28,`-4*$SIZE_T`($tj)
+       $POP    r29,`-3*$SIZE_T`($tj)
+       $POP    r30,`-2*$SIZE_T`($tj)
+       $POP    r31,`-1*$SIZE_T`($tj)
+       mr      $sp,$tj
        blr
        .long   0
-.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+       .byte   0,12,4,0,0x80,12,6,0
+       .long   0
+
+.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
index 37c65d3..aaf669a 100644 (file)
@@ -389,7 +389,9 @@ $data=<<EOF;
        $ST             r9,`6*$BNSZ`(r3)        #r[6]=c1
        $ST             r10,`7*$BNSZ`(r3)       #r[7]=c2
        blr
-       .long   0x00000000
+       .long   0
+       .byte   0,12,0x14,0,0,0,2,0
+       .long   0
 
 #
 #      NOTE:   The following label name should be changed to
@@ -814,8 +816,9 @@ $data=<<EOF;
 
 
        blr
-
-       .long   0x00000000
+       .long   0
+       .byte   0,12,0x14,0,0,0,2,0
+       .long   0
 
 #
 #      NOTE:   The following label name should be changed to
@@ -966,7 +969,9 @@ $data=<<EOF;
        $ST     r10,`6*$BNSZ`(r3)       #r[6]=c1
        $ST     r11,`7*$BNSZ`(r3)       #r[7]=c2
        blr
-       .long   0x00000000
+       .long   0
+       .byte   0,12,0x14,0,0,0,3,0
+       .long   0
 
 #
 #      NOTE:   The following label name should be changed to
@@ -1502,7 +1507,9 @@ $data=<<EOF;
        $ST     r12,`14*$BNSZ`(r3)      #r[14]=c3;
        $ST     r10,`15*$BNSZ`(r3)      #r[15]=c1;
        blr
-       .long   0x00000000
+       .long   0
+       .byte   0,12,0x14,0,0,0,3,0
+       .long   0
 
 #
 #      NOTE:   The following label name should be changed to
@@ -1550,8 +1557,9 @@ Lppcasm_sub_adios:
        subfze  r3,r0           # if carry bit is set then r3 = 0 else -1
        andi.   r3,r3,1         # keep only last bit.
        blr
-       .long   0x00000000
-
+       .long   0
+       .byte   0,12,0x14,0,0,0,4,0
+       .long   0
 
 #
 #      NOTE:   The following label name should be changed to
@@ -1594,7 +1602,9 @@ Lppcasm_add_mainloop:
 Lppcasm_add_adios:     
        addze   r3,r0                   #return carry bit.
        blr
-       .long   0x00000000
+       .long   0
+       .byte   0,12,0x14,0,0,0,4,0
+       .long   0
 
 #
 #      NOTE:   The following label name should be changed to
@@ -1707,7 +1717,9 @@ Lppcasm_div8:
 Lppcasm_div9:
        or      r3,r8,r0
        blr
-       .long   0x00000000
+       .long   0
+       .byte   0,12,0x14,0,0,0,3,0
+       .long   0
 
 #
 #      NOTE:   The following label name should be changed to
@@ -1746,8 +1758,9 @@ Lppcasm_sqr_mainloop:
        bdnz-   Lppcasm_sqr_mainloop
 Lppcasm_sqr_adios:     
        blr
-       .long   0x00000000
-
+       .long   0
+       .byte   0,12,0x14,0,0,0,3,0
+       .long   0
 
 #
 #      NOTE:   The following label name should be changed to
@@ -1850,7 +1863,9 @@ Lppcasm_mw_REM:
 Lppcasm_mw_OVER:       
        addi    r3,r12,0
        blr
-       .long   0x00000000
+       .long   0
+       .byte   0,12,0x14,0,0,0,4,0
+       .long   0
 
 #
 #      NOTE:   The following label name should be changed to
@@ -1973,7 +1988,9 @@ Lppcasm_maw_leftover:
 Lppcasm_maw_adios:     
        addi    r3,r12,0
        blr
-       .long   0x00000000
+       .long   0
+       .byte   0,12,0x14,0,0,0,4,0
+       .long   0
        .align  4
 EOF
 $data =~ s/\`([^\`]*)\`/eval $1/gem;
index f040466..a14e769 100644 (file)
@@ -70,7 +70,6 @@ $flavour = shift;
 if ($flavour =~ /32/) {
        $SIZE_T=4;
        $RZONE= 224;
-       $FRAME= $SIZE_T*12+8*12;
        $fname= "bn_mul_mont_fpu64";
 
        $STUX=  "stwux";        # store indexed and update
@@ -79,7 +78,6 @@ if ($flavour =~ /32/) {
 } elsif ($flavour =~ /64/) {
        $SIZE_T=8;
        $RZONE= 288;
-       $FRAME= $SIZE_T*12+8*12;
        $fname= "bn_mul_mont_fpu64";
 
        # same as above, but 64-bit mnemonics...
@@ -95,7 +93,7 @@ die "can't locate ppc-xlate.pl";
 
 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
 
-$FRAME=($FRAME+63)&~63;
+$FRAME=64;     # padded frame header
 $TRANSFER=16*8;
 
 $carry="r0";
@@ -112,16 +110,16 @@ $tp="r10";
 $j="r11";
 $i="r12";
 # non-volatile registers
-$nap_d="r14";  # interleaved ap and np in double format
-$a0="r15";     # ap[0]
-$t0="r16";     # temporary registers
-$t1="r17";
-$t2="r18";
-$t3="r19";
-$t4="r20";
-$t5="r21";
-$t6="r22";
-$t7="r23";
+$nap_d="r22";  # interleaved ap and np in double format
+$a0="r23";     # ap[0]
+$t0="r24";     # temporary registers
+$t1="r25";
+$t2="r26";
+$t3="r27";
+$t4="r28";
+$t5="r29";
+$t6="r30";
+$t7="r31";
 
 # PPC offers enough register bank capacity to unroll inner loops twice
 #
@@ -151,28 +149,17 @@ $ba="f0"; $bb="f1";       $bc="f2";       $bd="f3";
 $na="f4";      $nb="f5";       $nc="f6";       $nd="f7";
 $dota="f8";    $dotb="f9";
 $A0="f10";     $A1="f11";      $A2="f12";      $A3="f13";
-$N0="f14";     $N1="f15";      $N2="f16";      $N3="f17";
-$T0a="f18";    $T0b="f19";
-$T1a="f20";    $T1b="f21";
-$T2a="f22";    $T2b="f23";
-$T3a="f24";    $T3b="f25";
+$N0="f20";     $N1="f21";      $N2="f22";      $N3="f23";
+$T0a="f24";    $T0b="f25";
+$T1a="f26";    $T1b="f27";
+$T2a="f28";    $T2b="f29";
+$T3a="f30";    $T3b="f31";
 \f
 # sp----------->+-------------------------------+
 #              | saved sp                      |
 #              +-------------------------------+
-#              |                               |
-#              +-------------------------------+
-#              | 10 saved gpr, r14-r23         |
-#              .                               .
-#              .                               .
-#   +12*size_t +-------------------------------+
-#              | 12 saved fpr, f14-f25         |
-#              .                               .
 #              .                               .
-#   +12*8      +-------------------------------+
-#              | padding to 64 byte boundary   |
-#              .                               .
-#   +X         +-------------------------------+
+#   +64                +-------------------------------+
 #              | 16 gpr<->fpr transfer zone    |
 #              .                               .
 #              .                               .
@@ -192,6 +179,16 @@ $T3a="f24";        $T3b="f25";
 #              .                               .
 #              .                               .
 #              +-------------------------------+
+#              .                               .
+#   -12*size_t +-------------------------------+
+#              | 10 saved gpr, r22-r31         |
+#              .                               .
+#              .                               .
+#   -12*8      +-------------------------------+
+#              | 12 saved fpr, f20-f31         |
+#              .                               .
+#              .                               .
+#              +-------------------------------+
 \f
 $code=<<___;
 .machine "any"
@@ -215,30 +212,31 @@ $code=<<___;
        subf    $tp,$tp,$sp     ; $sp-$tp
        and     $tp,$tp,$i      ; minimize TLB usage
        subf    $tp,$sp,$tp     ; $tp-$sp
+       mr      $i,$sp
        $STUX   $sp,$sp,$tp     ; alloca
 
-       $PUSH   r14,`2*$SIZE_T`($sp)
-       $PUSH   r15,`3*$SIZE_T`($sp)
-       $PUSH   r16,`4*$SIZE_T`($sp)
-       $PUSH   r17,`5*$SIZE_T`($sp)
-       $PUSH   r18,`6*$SIZE_T`($sp)
-       $PUSH   r19,`7*$SIZE_T`($sp)
-       $PUSH   r20,`8*$SIZE_T`($sp)
-       $PUSH   r21,`9*$SIZE_T`($sp)
-       $PUSH   r22,`10*$SIZE_T`($sp)
-       $PUSH   r23,`11*$SIZE_T`($sp)
-       stfd    f14,`12*$SIZE_T+0`($sp)
-       stfd    f15,`12*$SIZE_T+8`($sp)
-       stfd    f16,`12*$SIZE_T+16`($sp)
-       stfd    f17,`12*$SIZE_T+24`($sp)
-       stfd    f18,`12*$SIZE_T+32`($sp)
-       stfd    f19,`12*$SIZE_T+40`($sp)
-       stfd    f20,`12*$SIZE_T+48`($sp)
-       stfd    f21,`12*$SIZE_T+56`($sp)
-       stfd    f22,`12*$SIZE_T+64`($sp)
-       stfd    f23,`12*$SIZE_T+72`($sp)
-       stfd    f24,`12*$SIZE_T+80`($sp)
-       stfd    f25,`12*$SIZE_T+88`($sp)
+       $PUSH   r22,`-12*8-10*$SIZE_T`($i)
+       $PUSH   r23,`-12*8-9*$SIZE_T`($i)
+       $PUSH   r24,`-12*8-8*$SIZE_T`($i)
+       $PUSH   r25,`-12*8-7*$SIZE_T`($i)
+       $PUSH   r26,`-12*8-6*$SIZE_T`($i)
+       $PUSH   r27,`-12*8-5*$SIZE_T`($i)
+       $PUSH   r28,`-12*8-4*$SIZE_T`($i)
+       $PUSH   r29,`-12*8-3*$SIZE_T`($i)
+       $PUSH   r30,`-12*8-2*$SIZE_T`($i)
+       $PUSH   r31,`-12*8-1*$SIZE_T`($i)
+       stfd    f20,`-12*8`($i)
+       stfd    f21,`-11*8`($i)
+       stfd    f22,`-10*8`($i)
+       stfd    f23,`-9*8`($i)
+       stfd    f24,`-8*8`($i)
+       stfd    f25,`-7*8`($i)
+       stfd    f26,`-6*8`($i)
+       stfd    f27,`-5*8`($i)
+       stfd    f28,`-4*8`($i)
+       stfd    f29,`-3*8`($i)
+       stfd    f30,`-2*8`($i)
+       stfd    f31,`-1*8`($i)
 ___
 $code.=<<___ if ($SIZE_T==8);
        ld      $a0,0($ap)      ; pull ap[0] value
@@ -1052,33 +1050,37 @@ Lcopy:                          ; copy or in-place refresh
 ___
 \f
 $code.=<<___;
-       $POP    r14,`2*$SIZE_T`($sp)
-       $POP    r15,`3*$SIZE_T`($sp)
-       $POP    r16,`4*$SIZE_T`($sp)
-       $POP    r17,`5*$SIZE_T`($sp)
-       $POP    r18,`6*$SIZE_T`($sp)
-       $POP    r19,`7*$SIZE_T`($sp)
-       $POP    r20,`8*$SIZE_T`($sp)
-       $POP    r21,`9*$SIZE_T`($sp)
-       $POP    r22,`10*$SIZE_T`($sp)
-       $POP    r23,`11*$SIZE_T`($sp)
-       lfd     f14,`12*$SIZE_T+0`($sp)
-       lfd     f15,`12*$SIZE_T+8`($sp)
-       lfd     f16,`12*$SIZE_T+16`($sp)
-       lfd     f17,`12*$SIZE_T+24`($sp)
-       lfd     f18,`12*$SIZE_T+32`($sp)
-       lfd     f19,`12*$SIZE_T+40`($sp)
-       lfd     f20,`12*$SIZE_T+48`($sp)
-       lfd     f21,`12*$SIZE_T+56`($sp)
-       lfd     f22,`12*$SIZE_T+64`($sp)
-       lfd     f23,`12*$SIZE_T+72`($sp)
-       lfd     f24,`12*$SIZE_T+80`($sp)
-       lfd     f25,`12*$SIZE_T+88`($sp)
-       $POP    $sp,0($sp)
+       $POP    $i,0($sp)
        li      r3,1    ; signal "handled"
+       $POP    r22,`-12*8-10*$SIZE_T`($i)
+       $POP    r23,`-12*8-9*$SIZE_T`($i)
+       $POP    r24,`-12*8-8*$SIZE_T`($i)
+       $POP    r25,`-12*8-7*$SIZE_T`($i)
+       $POP    r26,`-12*8-6*$SIZE_T`($i)
+       $POP    r27,`-12*8-5*$SIZE_T`($i)
+       $POP    r28,`-12*8-4*$SIZE_T`($i)
+       $POP    r29,`-12*8-3*$SIZE_T`($i)
+       $POP    r30,`-12*8-2*$SIZE_T`($i)
+       $POP    r31,`-12*8-1*$SIZE_T`($i)
+       lfd     f20,`-12*8`($i)
+       lfd     f21,`-11*8`($i)
+       lfd     f22,`-10*8`($i)
+       lfd     f23,`-9*8`($i)
+       lfd     f24,`-8*8`($i)
+       lfd     f25,`-7*8`($i)
+       lfd     f26,`-6*8`($i)
+       lfd     f27,`-5*8`($i)
+       lfd     f28,`-4*8`($i)
+       lfd     f29,`-3*8`($i)
+       lfd     f30,`-2*8`($i)
+       lfd     f31,`-1*8`($i)
+       mr      $sp,$i
        blr
        .long   0
-.asciz  "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+       .byte   0,12,4,0,0x8c,10,6,0
+       .long   0
+
+.asciz  "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
index d6220e7..3bdfff3 100755 (executable)
@@ -29,12 +29,16 @@ $code=<<___;
        fcfid   f1,f1
        extrdi  r0,r0,32,0
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
 
 .globl .OPENSSL_altivec_probe
 .align 4
 .OPENSSL_altivec_probe:
        .long   0x10000484      # vor   v0,v0,v0
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
 
 .globl .OPENSSL_wipe_cpu
 .align 4
@@ -65,6 +69,8 @@ $code=<<___;
        fmr     f12,f31
        fmr     f13,f31
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
 
 .globl .OPENSSL_atomic_add
 .align 4
@@ -75,6 +81,9 @@ Ladd: lwarx   r5,0,r3
        bne-    Ladd
        $SIGNX  r3,r0
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,2,0
+       .long   0
 
 .globl .OPENSSL_rdtsc
 .align 4
@@ -82,6 +91,8 @@ Ladd: lwarx   r5,0,r3
        mftb    r3
        mftbu   r4
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
 
 .globl .OPENSSL_cleanse
 .align 4
@@ -111,6 +122,9 @@ Laligned:
        andi.   r4,r4,3
        bne     Little
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,2,0
+       .long   0
 ___
 {
 my ($out,$cnt,$max)=("r3","r4","r5");
@@ -145,6 +159,9 @@ Loop:       mftb    $tick
 
        mr      r3,$cnt
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,2,0
+       .long   0
 
 .globl .OPENSSL_instrument_bus2
 .align 4
@@ -193,6 +210,9 @@ Ldone2:
        srwi    $cnt,$cnt,2
        sub     r3,r0,$cnt
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,3,0
+       .long   0
 ___
 }
 
index dcd0fcd..2140dd2 100755 (executable)
@@ -24,12 +24,14 @@ $flavour = shift;
 
 if ($flavour =~ /64/) {
        $SIZE_T =8;
+       $LRSAVE =2*$SIZE_T;
        $UCMP   ="cmpld";
        $STU    ="stdu";
        $POP    ="ld";
        $PUSH   ="std";
 } elsif ($flavour =~ /32/) {
        $SIZE_T =4;
+       $LRSAVE =$SIZE_T;
        $UCMP   ="cmplw";
        $STU    ="stwu";
        $POP    ="lwz";
@@ -43,7 +45,8 @@ die "can't locate ppc-xlate.pl";
 
 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
 
-$FRAME=24*$SIZE_T;
+$FRAME=24*$SIZE_T+64;
+$LOCALS=6*$SIZE_T;
 
 $K  ="r0";
 $sp ="r1";
@@ -162,9 +165,8 @@ $code=<<___;
 .globl .sha1_block_data_order
 .align 4
 .sha1_block_data_order:
+       $STU    $sp,-$FRAME($sp)
        mflr    r0
-       $STU    $sp,`-($FRAME+64)`($sp)
-       $PUSH   r0,`$FRAME-$SIZE_T*18`($sp)
        $PUSH   r15,`$FRAME-$SIZE_T*17`($sp)
        $PUSH   r16,`$FRAME-$SIZE_T*16`($sp)
        $PUSH   r17,`$FRAME-$SIZE_T*15`($sp)
@@ -182,6 +184,7 @@ $code=<<___;
        $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
        $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
        $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
+       $PUSH   r0,`$FRAME+$LRSAVE`($sp)
        lwz     $A,0($ctx)
        lwz     $B,4($ctx)
        lwz     $C,8($ctx)
@@ -192,37 +195,14 @@ $code=<<___;
 Laligned:
        mtctr   $num
        bl      Lsha1_block_private
-Ldone:
-       $POP    r0,`$FRAME-$SIZE_T*18`($sp)
-       $POP    r15,`$FRAME-$SIZE_T*17`($sp)
-       $POP    r16,`$FRAME-$SIZE_T*16`($sp)
-       $POP    r17,`$FRAME-$SIZE_T*15`($sp)
-       $POP    r18,`$FRAME-$SIZE_T*14`($sp)
-       $POP    r19,`$FRAME-$SIZE_T*13`($sp)
-       $POP    r20,`$FRAME-$SIZE_T*12`($sp)
-       $POP    r21,`$FRAME-$SIZE_T*11`($sp)
-       $POP    r22,`$FRAME-$SIZE_T*10`($sp)
-       $POP    r23,`$FRAME-$SIZE_T*9`($sp)
-       $POP    r24,`$FRAME-$SIZE_T*8`($sp)
-       $POP    r25,`$FRAME-$SIZE_T*7`($sp)
-       $POP    r26,`$FRAME-$SIZE_T*6`($sp)
-       $POP    r27,`$FRAME-$SIZE_T*5`($sp)
-       $POP    r28,`$FRAME-$SIZE_T*4`($sp)
-       $POP    r29,`$FRAME-$SIZE_T*3`($sp)
-       $POP    r30,`$FRAME-$SIZE_T*2`($sp)
-       $POP    r31,`$FRAME-$SIZE_T*1`($sp)
-       mtlr    r0
-       addi    $sp,$sp,`$FRAME+64`
-       blr
-___
+       b       Ldone
 
-# PowerPC specification allows an implementation to be ill-behaved
-# upon unaligned access which crosses page boundary. "Better safe
-# than sorry" principle makes me treat it specially. But I don't
-# look for particular offending word, but rather for 64-byte input
-# block which crosses the boundary. Once found that block is aligned
-# and hashed separately...
-$code.=<<___;
+; PowerPC specification allows an implementation to be ill-behaved
+; upon unaligned access which crosses page boundary. "Better safe
+; than sorry" principle makes me treat it specially. But I don't
+; look for particular offending word, but rather for 64-byte input
+; block which crosses the boundary. Once found that block is aligned
+; and hashed separately...
 .align 4
 Lunaligned:
        subfic  $t1,$inp,4096
@@ -237,7 +217,7 @@ Lunaligned:
 Lcross_page:
        li      $t1,16
        mtctr   $t1
-       addi    r20,$sp,$FRAME  ; spot below the frame
+       addi    r20,$sp,$LOCALS ; spot within the frame
 Lmemcpy:
        lbz     r16,0($inp)
        lbz     r17,1($inp)
@@ -251,15 +231,40 @@ Lmemcpy:
        addi    r20,r20,4
        bdnz    Lmemcpy
 
-       $PUSH   $inp,`$FRAME-$SIZE_T*19`($sp)
+       $PUSH   $inp,`$FRAME-$SIZE_T*18`($sp)
        li      $t1,1
-       addi    $inp,$sp,$FRAME
+       addi    $inp,$sp,$LOCALS
        mtctr   $t1
        bl      Lsha1_block_private
-       $POP    $inp,`$FRAME-$SIZE_T*19`($sp)
+       $POP    $inp,`$FRAME-$SIZE_T*18`($sp)
        addic.  $num,$num,-1
        bne-    Lunaligned
-       b       Ldone
+
+Ldone:
+       $POP    r0,`$FRAME+$LRSAVE`($sp)
+       $POP    r15,`$FRAME-$SIZE_T*17`($sp)
+       $POP    r16,`$FRAME-$SIZE_T*16`($sp)
+       $POP    r17,`$FRAME-$SIZE_T*15`($sp)
+       $POP    r18,`$FRAME-$SIZE_T*14`($sp)
+       $POP    r19,`$FRAME-$SIZE_T*13`($sp)
+       $POP    r20,`$FRAME-$SIZE_T*12`($sp)
+       $POP    r21,`$FRAME-$SIZE_T*11`($sp)
+       $POP    r22,`$FRAME-$SIZE_T*10`($sp)
+       $POP    r23,`$FRAME-$SIZE_T*9`($sp)
+       $POP    r24,`$FRAME-$SIZE_T*8`($sp)
+       $POP    r25,`$FRAME-$SIZE_T*7`($sp)
+       $POP    r26,`$FRAME-$SIZE_T*6`($sp)
+       $POP    r27,`$FRAME-$SIZE_T*5`($sp)
+       $POP    r28,`$FRAME-$SIZE_T*4`($sp)
+       $POP    r29,`$FRAME-$SIZE_T*3`($sp)
+       $POP    r30,`$FRAME-$SIZE_T*2`($sp)
+       $POP    r31,`$FRAME-$SIZE_T*1`($sp)
+       mtlr    r0
+       addi    $sp,$sp,$FRAME
+       blr
+       .long   0
+       .byte   0,12,4,1,0x80,18,3,0
+       .long   0
 ___
 
 # This is private block function, which uses tailored calling
@@ -309,6 +314,8 @@ $code.=<<___;
        addi    $inp,$inp,`16*4`
        bdnz-   Lsha1_block_private
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
 ___
 $code.=<<___;
 .asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
index 768a6a6..6b44a68 100755 (executable)
@@ -40,6 +40,7 @@ $output =shift;
 
 if ($flavour =~ /64/) {
        $SIZE_T=8;
+       $LRSAVE=2*$SIZE_T;
        $STU="stdu";
        $UCMP="cmpld";
        $SHL="sldi";
@@ -47,6 +48,7 @@ if ($flavour =~ /64/) {
        $PUSH="std";
 } elsif ($flavour =~ /32/) {
        $SIZE_T=4;
+       $LRSAVE=$SIZE_T;
        $STU="stwu";
        $UCMP="cmplw";
        $SHL="slwi";
@@ -87,7 +89,8 @@ if ($output =~ /512/) {
        $SHR="srwi";
 }
 
-$FRAME=32*$SIZE_T;
+$FRAME=32*$SIZE_T+16*$SZ;
+$LOCALS=6*$SIZE_T;
 
 $sp ="r1";
 $toc="r2";
@@ -179,13 +182,12 @@ $code=<<___;
 .globl $func
 .align 6
 $func:
+       $STU    $sp,-$FRAME($sp)
        mflr    r0
-       $STU    $sp,`-($FRAME+16*$SZ)`($sp)
        $SHL    $num,$num,`log(16*$SZ)/log(2)`
 
        $PUSH   $ctx,`$FRAME-$SIZE_T*22`($sp)
 
-       $PUSH   r0,`$FRAME-$SIZE_T*21`($sp)
        $PUSH   $toc,`$FRAME-$SIZE_T*20`($sp)
        $PUSH   r13,`$FRAME-$SIZE_T*19`($sp)
        $PUSH   r14,`$FRAME-$SIZE_T*18`($sp)
@@ -206,6 +208,7 @@ $func:
        $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
        $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
        $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
+       $PUSH   r0,`$FRAME+$LRSAVE`($sp)
 
        $LD     $A,`0*$SZ`($ctx)
        mr      $inp,r4                         ; incarnate $inp
@@ -217,7 +220,7 @@ $func:
        $LD     $G,`6*$SZ`($ctx)
        $LD     $H,`7*$SZ`($ctx)
 
-       b       LPICmeup
+       bl      LPICmeup
 LPICedup:
        andi.   r0,$inp,3
        bne     Lunaligned
@@ -226,40 +229,14 @@ Laligned:
        $PUSH   $num,`$FRAME-$SIZE_T*24`($sp)   ; end pointer
        $PUSH   $inp,`$FRAME-$SIZE_T*23`($sp)   ; inp pointer
        bl      Lsha2_block_private
-Ldone:
-       $POP    r0,`$FRAME-$SIZE_T*21`($sp)
-       $POP    $toc,`$FRAME-$SIZE_T*20`($sp)
-       $POP    r13,`$FRAME-$SIZE_T*19`($sp)
-       $POP    r14,`$FRAME-$SIZE_T*18`($sp)
-       $POP    r15,`$FRAME-$SIZE_T*17`($sp)
-       $POP    r16,`$FRAME-$SIZE_T*16`($sp)
-       $POP    r17,`$FRAME-$SIZE_T*15`($sp)
-       $POP    r18,`$FRAME-$SIZE_T*14`($sp)
-       $POP    r19,`$FRAME-$SIZE_T*13`($sp)
-       $POP    r20,`$FRAME-$SIZE_T*12`($sp)
-       $POP    r21,`$FRAME-$SIZE_T*11`($sp)
-       $POP    r22,`$FRAME-$SIZE_T*10`($sp)
-       $POP    r23,`$FRAME-$SIZE_T*9`($sp)
-       $POP    r24,`$FRAME-$SIZE_T*8`($sp)
-       $POP    r25,`$FRAME-$SIZE_T*7`($sp)
-       $POP    r26,`$FRAME-$SIZE_T*6`($sp)
-       $POP    r27,`$FRAME-$SIZE_T*5`($sp)
-       $POP    r28,`$FRAME-$SIZE_T*4`($sp)
-       $POP    r29,`$FRAME-$SIZE_T*3`($sp)
-       $POP    r30,`$FRAME-$SIZE_T*2`($sp)
-       $POP    r31,`$FRAME-$SIZE_T*1`($sp)
-       mtlr    r0
-       addi    $sp,$sp,`$FRAME+16*$SZ`
-       blr
-___
+       b       Ldone
 
-# PowerPC specification allows an implementation to be ill-behaved
-# upon unaligned access which crosses page boundary. "Better safe
-# than sorry" principle makes me treat it specially. But I don't
-# look for particular offending word, but rather for the input
-# block which crosses the boundary. Once found that block is aligned
-# and hashed separately...
-$code.=<<___;
+; PowerPC specification allows an implementation to be ill-behaved
+; upon unaligned access which crosses page boundary. "Better safe
+; than sorry" principle makes me treat it specially. But I don't
+; look for particular offending word, but rather for the input
+; block which crosses the boundary. Once found that block is aligned
+; and hashed separately...
 .align 4
 Lunaligned:
        subfic  $t1,$inp,4096
@@ -278,7 +255,7 @@ Lunaligned:
 Lcross_page:
        li      $t1,`16*$SZ/4`
        mtctr   $t1
-       addi    r20,$sp,$FRAME                  ; aligned spot below the frame
+       addi    r20,$sp,$LOCALS                 ; aligned spot within the frame
 Lmemcpy:
        lbz     r16,0($inp)
        lbz     r17,1($inp)
@@ -293,8 +270,8 @@ Lmemcpy:
        bdnz    Lmemcpy
 
        $PUSH   $inp,`$FRAME-$SIZE_T*26`($sp)   ; save real inp
-       addi    $t1,$sp,`$FRAME+16*$SZ`         ; fictitious end pointer
-       addi    $inp,$sp,$FRAME                 ; fictitious inp pointer
+       addi    $t1,$sp,`$LOCALS+16*$SZ`        ; fictitious end pointer
+       addi    $inp,$sp,$LOCALS                ; fictitious inp pointer
        $PUSH   $num,`$FRAME-$SIZE_T*25`($sp)   ; save real num
        $PUSH   $t1,`$FRAME-$SIZE_T*24`($sp)    ; end pointer
        $PUSH   $inp,`$FRAME-$SIZE_T*23`($sp)   ; inp pointer
@@ -303,10 +280,36 @@ Lmemcpy:
        $POP    $num,`$FRAME-$SIZE_T*25`($sp)   ; restore real num
        addic.  $num,$num,`-16*$SZ`             ; num--
        bne-    Lunaligned
-       b       Ldone
-___
 
-$code.=<<___;
+Ldone:
+       $POP    r0,`$FRAME+$LRSAVE`($sp)
+       $POP    $toc,`$FRAME-$SIZE_T*20`($sp)
+       $POP    r13,`$FRAME-$SIZE_T*19`($sp)
+       $POP    r14,`$FRAME-$SIZE_T*18`($sp)
+       $POP    r15,`$FRAME-$SIZE_T*17`($sp)
+       $POP    r16,`$FRAME-$SIZE_T*16`($sp)
+       $POP    r17,`$FRAME-$SIZE_T*15`($sp)
+       $POP    r18,`$FRAME-$SIZE_T*14`($sp)
+       $POP    r19,`$FRAME-$SIZE_T*13`($sp)
+       $POP    r20,`$FRAME-$SIZE_T*12`($sp)
+       $POP    r21,`$FRAME-$SIZE_T*11`($sp)
+       $POP    r22,`$FRAME-$SIZE_T*10`($sp)
+       $POP    r23,`$FRAME-$SIZE_T*9`($sp)
+       $POP    r24,`$FRAME-$SIZE_T*8`($sp)
+       $POP    r25,`$FRAME-$SIZE_T*7`($sp)
+       $POP    r26,`$FRAME-$SIZE_T*6`($sp)
+       $POP    r27,`$FRAME-$SIZE_T*5`($sp)
+       $POP    r28,`$FRAME-$SIZE_T*4`($sp)
+       $POP    r29,`$FRAME-$SIZE_T*3`($sp)
+       $POP    r30,`$FRAME-$SIZE_T*2`($sp)
+       $POP    r31,`$FRAME-$SIZE_T*1`($sp)
+       mtlr    r0
+       addi    $sp,$sp,$FRAME
+       blr
+       .long   0
+       .byte   0,12,4,1,0x80,18,3,0
+       .long   0
+
 .align 4
 Lsha2_block_private:
 ___
@@ -372,6 +375,8 @@ $code.=<<___;
        $ST     $H,`7*$SZ`($ctx)
        bne     Lsha2_block_private
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
 ___
 
 # Ugly hack here, because PPC assembler syntax seem to vary too
@@ -379,22 +384,15 @@ ___
 $code.=<<___;
 .align 6
 LPICmeup:
-       bl      LPIC
-       addi    $Tbl,$Tbl,`64-4`        ; "distance" between . and last nop
-       b       LPICedup
-       nop
-       nop
-       nop
-       nop
-       nop
-LPIC:  mflr    $Tbl
+       mflr    r0
+       bcl     20,31,\$+4
+       mflr    $Tbl    ; vvvvvv "distance" between . and 1st data entry
+       addi    $Tbl,$Tbl,`64-8`
+       mtlr    r0
        blr
-       nop
-       nop
-       nop
-       nop
-       nop
-       nop
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
+       .space  `64-9*4`
 ___
 $code.=<<___ if ($SZ==8);
        .long   0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd