PPC assembler pack update from HEAD.
authorAndy Polyakov <appro@openssl.org>
Mon, 14 Nov 2011 20:54:17 +0000 (20:54 +0000)
committerAndy Polyakov <appro@openssl.org>
Mon, 14 Nov 2011 20:54:17 +0000 (20:54 +0000)
crypto/aes/asm/aes-ppc.pl
crypto/bn/asm/ppc-mont.pl
crypto/bn/asm/ppc.pl
crypto/bn/asm/ppc64-mont.pl
crypto/ppccap.c [new file with mode: 0644]
crypto/ppccpuid.pl
crypto/sha/asm/sha1-ppc.pl
crypto/sha/asm/sha512-ppc.pl

index f82c5e1..7c52cbe 100644 (file)
@@ -7,7 +7,7 @@
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 
-# Needs more work: key setup, page boundaries, CBC routine...
+# Needs more work: key setup, CBC routine...
 #
 # ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
 # 128-bit key, which is ~40% better than 64-bit code generated by gcc
@@ -18,7 +18,7 @@
 
 # February 2010
 #
-# Rescheduling instructions to favour Power6 pipeline gives 10%
+# Rescheduling instructions to favour Power6 pipeline gave 10%
 # performance improvement on the platfrom in question (and marginal
 # improvement even on others). It should be noted that Power6 fails
 # to process byte in 18 cycles, only in 23, because it fails to issue
@@ -33,11 +33,13 @@ $flavour = shift;
 
 if ($flavour =~ /64/) {
        $SIZE_T =8;
+       $LRSAVE =2*$SIZE_T;
        $STU    ="stdu";
        $POP    ="ld";
        $PUSH   ="std";
 } elsif ($flavour =~ /32/) {
        $SIZE_T =4;
+       $LRSAVE =$SIZE_T;
        $STU    ="stwu";
        $POP    ="lwz";
        $PUSH   ="stw";
@@ -116,15 +118,19 @@ LAES_Te:
        addi    $Tbl0,$Tbl0,`128-8`
        mtlr    r0
        blr
-       .space  `32-24`
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
+       .space  `64-9*4`
 LAES_Td:
        mflr    r0
        bcl     20,31,\$+4
        mflr    $Tbl0   ;    vvvvvvvv "distance" between . and 1st data entry
-       addi    $Tbl0,$Tbl0,`128-8-32+2048+256`
+       addi    $Tbl0,$Tbl0,`128-64-8+2048+256`
        mtlr    r0
        blr
-       .space  `128-32-24`
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
+       .space  `128-64-9*4`
 ___
 &_data_word(
        0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
@@ -328,10 +334,9 @@ $code.=<<___;
 .globl .AES_encrypt
 .align 7
 .AES_encrypt:
-       mflr    r0
        $STU    $sp,-$FRAME($sp)
+       mflr    r0
 
-       $PUSH   r0,`$FRAME-$SIZE_T*21`($sp)
        $PUSH   $toc,`$FRAME-$SIZE_T*20`($sp)
        $PUSH   r13,`$FRAME-$SIZE_T*19`($sp)
        $PUSH   r14,`$FRAME-$SIZE_T*18`($sp)
@@ -352,7 +357,14 @@ $code.=<<___;
        $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
        $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
        $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
+       $PUSH   r0,`$FRAME+$LRSAVE`($sp)
+
+       andi.   $t0,$inp,3
+       andi.   $t1,$out,3
+       or.     $t0,$t0,$t1
+       bne     Lenc_unaligned
 
+Lenc_unaligned_ok:
        lwz     $s0,0($inp)
        lwz     $s1,4($inp)
        lwz     $s2,8($inp)
@@ -363,8 +375,80 @@ $code.=<<___;
        stw     $s1,4($out)
        stw     $s2,8($out)
        stw     $s3,12($out)
+       b       Lenc_done
+
+Lenc_unaligned:
+       subfic  $t0,$inp,4096
+       subfic  $t1,$out,4096
+       andi.   $t0,$t0,4096-16
+       beq     Lenc_xpage
+       andi.   $t1,$t1,4096-16
+       bne     Lenc_unaligned_ok
+
+Lenc_xpage:
+       lbz     $acc00,0($inp)
+       lbz     $acc01,1($inp)
+       lbz     $acc02,2($inp)
+       lbz     $s0,3($inp)
+       lbz     $acc04,4($inp)
+       lbz     $acc05,5($inp)
+       lbz     $acc06,6($inp)
+       lbz     $s1,7($inp)
+       lbz     $acc08,8($inp)
+       lbz     $acc09,9($inp)
+       lbz     $acc10,10($inp)
+       insrwi  $s0,$acc00,8,0
+       lbz     $s2,11($inp)
+       insrwi  $s1,$acc04,8,0
+       lbz     $acc12,12($inp)
+       insrwi  $s0,$acc01,8,8
+       lbz     $acc13,13($inp)
+       insrwi  $s1,$acc05,8,8
+       lbz     $acc14,14($inp)
+       insrwi  $s0,$acc02,8,16
+       lbz     $s3,15($inp)
+       insrwi  $s1,$acc06,8,16
+       insrwi  $s2,$acc08,8,0
+       insrwi  $s3,$acc12,8,0
+       insrwi  $s2,$acc09,8,8
+       insrwi  $s3,$acc13,8,8
+       insrwi  $s2,$acc10,8,16
+       insrwi  $s3,$acc14,8,16
+
+       bl      LAES_Te
+       bl      Lppc_AES_encrypt_compact
+
+       extrwi  $acc00,$s0,8,0
+       extrwi  $acc01,$s0,8,8
+       stb     $acc00,0($out)
+       extrwi  $acc02,$s0,8,16
+       stb     $acc01,1($out)
+       stb     $acc02,2($out)
+       extrwi  $acc04,$s1,8,0
+       stb     $s0,3($out)
+       extrwi  $acc05,$s1,8,8
+       stb     $acc04,4($out)
+       extrwi  $acc06,$s1,8,16
+       stb     $acc05,5($out)
+       stb     $acc06,6($out)
+       extrwi  $acc08,$s2,8,0
+       stb     $s1,7($out)
+       extrwi  $acc09,$s2,8,8
+       stb     $acc08,8($out)
+       extrwi  $acc10,$s2,8,16
+       stb     $acc09,9($out)
+       stb     $acc10,10($out)
+       extrwi  $acc12,$s3,8,0
+       stb     $s2,11($out)
+       extrwi  $acc13,$s3,8,8
+       stb     $acc12,12($out)
+       extrwi  $acc14,$s3,8,16
+       stb     $acc13,13($out)
+       stb     $acc14,14($out)
+       stb     $s3,15($out)
 
-       $POP    r0,`$FRAME-$SIZE_T*21`($sp)
+Lenc_done:
+       $POP    r0,`$FRAME+$LRSAVE`($sp)
        $POP    $toc,`$FRAME-$SIZE_T*20`($sp)
        $POP    r13,`$FRAME-$SIZE_T*19`($sp)
        $POP    r14,`$FRAME-$SIZE_T*18`($sp)
@@ -388,18 +472,21 @@ $code.=<<___;
        mtlr    r0
        addi    $sp,$sp,$FRAME
        blr
+       .long   0
+       .byte   0,12,4,1,0x80,18,3,0
+       .long   0
 
 .align 5
 Lppc_AES_encrypt:
        lwz     $acc00,240($key)
-       lwz     $t0,0($key)
-       lwz     $t1,4($key)
-       lwz     $t2,8($key)
-       lwz     $t3,12($key)
        addi    $Tbl1,$Tbl0,3
+       lwz     $t0,0($key)
        addi    $Tbl2,$Tbl0,2
+       lwz     $t1,4($key)
        addi    $Tbl3,$Tbl0,1
+       lwz     $t2,8($key)
        addi    $acc00,$acc00,-1
+       lwz     $t3,12($key)
        addi    $key,$key,16
        xor     $s0,$s0,$t0
        xor     $s1,$s1,$t1
@@ -413,44 +500,44 @@ Lenc_loop:
        rlwinm  $acc02,$s2,`32-24+3`,21,28
        rlwinm  $acc03,$s3,`32-24+3`,21,28
        lwz     $t0,0($key)
-       lwz     $t1,4($key)
        rlwinm  $acc04,$s1,`32-16+3`,21,28
+       lwz     $t1,4($key)
        rlwinm  $acc05,$s2,`32-16+3`,21,28
        lwz     $t2,8($key)
-       lwz     $t3,12($key)
        rlwinm  $acc06,$s3,`32-16+3`,21,28
+       lwz     $t3,12($key)
        rlwinm  $acc07,$s0,`32-16+3`,21,28
        lwzx    $acc00,$Tbl0,$acc00
-       lwzx    $acc01,$Tbl0,$acc01
        rlwinm  $acc08,$s2,`32-8+3`,21,28
+       lwzx    $acc01,$Tbl0,$acc01
        rlwinm  $acc09,$s3,`32-8+3`,21,28
        lwzx    $acc02,$Tbl0,$acc02
-       lwzx    $acc03,$Tbl0,$acc03
        rlwinm  $acc10,$s0,`32-8+3`,21,28
+       lwzx    $acc03,$Tbl0,$acc03
        rlwinm  $acc11,$s1,`32-8+3`,21,28
        lwzx    $acc04,$Tbl1,$acc04
-       lwzx    $acc05,$Tbl1,$acc05
        rlwinm  $acc12,$s3,`0+3`,21,28
+       lwzx    $acc05,$Tbl1,$acc05
        rlwinm  $acc13,$s0,`0+3`,21,28
        lwzx    $acc06,$Tbl1,$acc06
-       lwzx    $acc07,$Tbl1,$acc07
        rlwinm  $acc14,$s1,`0+3`,21,28
+       lwzx    $acc07,$Tbl1,$acc07
        rlwinm  $acc15,$s2,`0+3`,21,28
        lwzx    $acc08,$Tbl2,$acc08
-       lwzx    $acc09,$Tbl2,$acc09
        xor     $t0,$t0,$acc00
+       lwzx    $acc09,$Tbl2,$acc09
        xor     $t1,$t1,$acc01
        lwzx    $acc10,$Tbl2,$acc10
-       lwzx    $acc11,$Tbl2,$acc11
        xor     $t2,$t2,$acc02
+       lwzx    $acc11,$Tbl2,$acc11
        xor     $t3,$t3,$acc03
        lwzx    $acc12,$Tbl3,$acc12
-       lwzx    $acc13,$Tbl3,$acc13
        xor     $t0,$t0,$acc04
+       lwzx    $acc13,$Tbl3,$acc13
        xor     $t1,$t1,$acc05
        lwzx    $acc14,$Tbl3,$acc14
-       lwzx    $acc15,$Tbl3,$acc15
        xor     $t2,$t2,$acc06
+       lwzx    $acc15,$Tbl3,$acc15
        xor     $t3,$t3,$acc07
        xor     $t0,$t0,$acc08
        xor     $t1,$t1,$acc09
@@ -466,60 +553,60 @@ Lenc_loop:
        addi    $Tbl2,$Tbl0,2048
        nop
        lwz     $t0,0($key)
-       lwz     $t1,4($key)
        rlwinm  $acc00,$s0,`32-24`,24,31
+       lwz     $t1,4($key)
        rlwinm  $acc01,$s1,`32-24`,24,31
        lwz     $t2,8($key)
-       lwz     $t3,12($key)
        rlwinm  $acc02,$s2,`32-24`,24,31
+       lwz     $t3,12($key)
        rlwinm  $acc03,$s3,`32-24`,24,31
        lwz     $acc08,`2048+0`($Tbl0)  ! prefetch Te4
-       lwz     $acc09,`2048+32`($Tbl0)
        rlwinm  $acc04,$s1,`32-16`,24,31
+       lwz     $acc09,`2048+32`($Tbl0)
        rlwinm  $acc05,$s2,`32-16`,24,31
        lwz     $acc10,`2048+64`($Tbl0)
-       lwz     $acc11,`2048+96`($Tbl0)
        rlwinm  $acc06,$s3,`32-16`,24,31
+       lwz     $acc11,`2048+96`($Tbl0)
        rlwinm  $acc07,$s0,`32-16`,24,31
        lwz     $acc12,`2048+128`($Tbl0)
-       lwz     $acc13,`2048+160`($Tbl0)
        rlwinm  $acc08,$s2,`32-8`,24,31
+       lwz     $acc13,`2048+160`($Tbl0)
        rlwinm  $acc09,$s3,`32-8`,24,31
        lwz     $acc14,`2048+192`($Tbl0)
-       lwz     $acc15,`2048+224`($Tbl0)
        rlwinm  $acc10,$s0,`32-8`,24,31
+       lwz     $acc15,`2048+224`($Tbl0)
        rlwinm  $acc11,$s1,`32-8`,24,31
        lbzx    $acc00,$Tbl2,$acc00
-       lbzx    $acc01,$Tbl2,$acc01
        rlwinm  $acc12,$s3,`0`,24,31
+       lbzx    $acc01,$Tbl2,$acc01
        rlwinm  $acc13,$s0,`0`,24,31
        lbzx    $acc02,$Tbl2,$acc02
-       lbzx    $acc03,$Tbl2,$acc03
        rlwinm  $acc14,$s1,`0`,24,31
+       lbzx    $acc03,$Tbl2,$acc03
        rlwinm  $acc15,$s2,`0`,24,31
        lbzx    $acc04,$Tbl2,$acc04
-       lbzx    $acc05,$Tbl2,$acc05
        rlwinm  $s0,$acc00,24,0,7
+       lbzx    $acc05,$Tbl2,$acc05
        rlwinm  $s1,$acc01,24,0,7
        lbzx    $acc06,$Tbl2,$acc06
-       lbzx    $acc07,$Tbl2,$acc07
        rlwinm  $s2,$acc02,24,0,7
+       lbzx    $acc07,$Tbl2,$acc07
        rlwinm  $s3,$acc03,24,0,7
        lbzx    $acc08,$Tbl2,$acc08
-       lbzx    $acc09,$Tbl2,$acc09
        rlwimi  $s0,$acc04,16,8,15
+       lbzx    $acc09,$Tbl2,$acc09
        rlwimi  $s1,$acc05,16,8,15
        lbzx    $acc10,$Tbl2,$acc10
-       lbzx    $acc11,$Tbl2,$acc11
        rlwimi  $s2,$acc06,16,8,15
+       lbzx    $acc11,$Tbl2,$acc11
        rlwimi  $s3,$acc07,16,8,15
        lbzx    $acc12,$Tbl2,$acc12
-       lbzx    $acc13,$Tbl2,$acc13
        rlwimi  $s0,$acc08,8,16,23
+       lbzx    $acc13,$Tbl2,$acc13
        rlwimi  $s1,$acc09,8,16,23
        lbzx    $acc14,$Tbl2,$acc14
-       lbzx    $acc15,$Tbl2,$acc15
        rlwimi  $s2,$acc10,8,16,23
+       lbzx    $acc15,$Tbl2,$acc15
        rlwimi  $s3,$acc11,8,16,23
        or      $s0,$s0,$acc12
        or      $s1,$s1,$acc13
@@ -530,29 +617,31 @@ Lenc_loop:
        xor     $s2,$s2,$t2
        xor     $s3,$s3,$t3
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
 
 .align 4
 Lppc_AES_encrypt_compact:
        lwz     $acc00,240($key)
-       lwz     $t0,0($key)
-       lwz     $t1,4($key)
-       lwz     $t2,8($key)
-       lwz     $t3,12($key)
        addi    $Tbl1,$Tbl0,2048
+       lwz     $t0,0($key)
        lis     $mask80,0x8080
+       lwz     $t1,4($key)
        lis     $mask1b,0x1b1b
-       addi    $key,$key,16
+       lwz     $t2,8($key)
        ori     $mask80,$mask80,0x8080
+       lwz     $t3,12($key)
        ori     $mask1b,$mask1b,0x1b1b
+       addi    $key,$key,16
        mtctr   $acc00
 .align 4
 Lenc_compact_loop:
        xor     $s0,$s0,$t0
        xor     $s1,$s1,$t1
-       xor     $s2,$s2,$t2
-       xor     $s3,$s3,$t3
        rlwinm  $acc00,$s0,`32-24`,24,31
+       xor     $s2,$s2,$t2
        rlwinm  $acc01,$s1,`32-24`,24,31
+       xor     $s3,$s3,$t3
        rlwinm  $acc02,$s2,`32-24`,24,31
        rlwinm  $acc03,$s3,`32-24`,24,31
        rlwinm  $acc04,$s1,`32-16`,24,31
@@ -560,48 +649,48 @@ Lenc_compact_loop:
        rlwinm  $acc06,$s3,`32-16`,24,31
        rlwinm  $acc07,$s0,`32-16`,24,31
        lbzx    $acc00,$Tbl1,$acc00
-       lbzx    $acc01,$Tbl1,$acc01
        rlwinm  $acc08,$s2,`32-8`,24,31
+       lbzx    $acc01,$Tbl1,$acc01
        rlwinm  $acc09,$s3,`32-8`,24,31
        lbzx    $acc02,$Tbl1,$acc02
-       lbzx    $acc03,$Tbl1,$acc03
        rlwinm  $acc10,$s0,`32-8`,24,31
+       lbzx    $acc03,$Tbl1,$acc03
        rlwinm  $acc11,$s1,`32-8`,24,31
        lbzx    $acc04,$Tbl1,$acc04
-       lbzx    $acc05,$Tbl1,$acc05
        rlwinm  $acc12,$s3,`0`,24,31
+       lbzx    $acc05,$Tbl1,$acc05
        rlwinm  $acc13,$s0,`0`,24,31
        lbzx    $acc06,$Tbl1,$acc06
-       lbzx    $acc07,$Tbl1,$acc07
        rlwinm  $acc14,$s1,`0`,24,31
+       lbzx    $acc07,$Tbl1,$acc07
        rlwinm  $acc15,$s2,`0`,24,31
        lbzx    $acc08,$Tbl1,$acc08
-       lbzx    $acc09,$Tbl1,$acc09
        rlwinm  $s0,$acc00,24,0,7
+       lbzx    $acc09,$Tbl1,$acc09
        rlwinm  $s1,$acc01,24,0,7
        lbzx    $acc10,$Tbl1,$acc10
-       lbzx    $acc11,$Tbl1,$acc11
        rlwinm  $s2,$acc02,24,0,7
+       lbzx    $acc11,$Tbl1,$acc11
        rlwinm  $s3,$acc03,24,0,7
        lbzx    $acc12,$Tbl1,$acc12
-       lbzx    $acc13,$Tbl1,$acc13
        rlwimi  $s0,$acc04,16,8,15
+       lbzx    $acc13,$Tbl1,$acc13
        rlwimi  $s1,$acc05,16,8,15
        lbzx    $acc14,$Tbl1,$acc14
-       lbzx    $acc15,$Tbl1,$acc15
        rlwimi  $s2,$acc06,16,8,15
+       lbzx    $acc15,$Tbl1,$acc15
        rlwimi  $s3,$acc07,16,8,15
        rlwimi  $s0,$acc08,8,16,23
        rlwimi  $s1,$acc09,8,16,23
        rlwimi  $s2,$acc10,8,16,23
        rlwimi  $s3,$acc11,8,16,23
        lwz     $t0,0($key)
-       lwz     $t1,4($key)
        or      $s0,$s0,$acc12
+       lwz     $t1,4($key)
        or      $s1,$s1,$acc13
        lwz     $t2,8($key)
-       lwz     $t3,12($key)
        or      $s2,$s2,$acc14
+       lwz     $t3,12($key)
        or      $s3,$s3,$acc15
 
        addi    $key,$key,16
@@ -612,12 +701,12 @@ Lenc_compact_loop:
        and     $acc02,$s2,$mask80
        and     $acc03,$s3,$mask80
        srwi    $acc04,$acc00,7         # r1>>7
-       srwi    $acc05,$acc01,7
-       srwi    $acc06,$acc02,7
-       srwi    $acc07,$acc03,7
        andc    $acc08,$s0,$mask80      # r0&0x7f7f7f7f
+       srwi    $acc05,$acc01,7
        andc    $acc09,$s1,$mask80
+       srwi    $acc06,$acc02,7
        andc    $acc10,$s2,$mask80
+       srwi    $acc07,$acc03,7
        andc    $acc11,$s3,$mask80
        sub     $acc00,$acc00,$acc04    # r1-(r1>>7)
        sub     $acc01,$acc01,$acc05
@@ -633,32 +722,32 @@ Lenc_compact_loop:
        and     $acc03,$acc03,$mask1b
        xor     $acc00,$acc00,$acc08    # r2
        xor     $acc01,$acc01,$acc09
+        rotlwi $acc12,$s0,16           # ROTATE(r0,16)
        xor     $acc02,$acc02,$acc10
+        rotlwi $acc13,$s1,16
        xor     $acc03,$acc03,$acc11
+        rotlwi $acc14,$s2,16
 
-       rotlwi  $acc12,$s0,16           # ROTATE(r0,16)
-       rotlwi  $acc13,$s1,16
-       rotlwi  $acc14,$s2,16
-       rotlwi  $acc15,$s3,16
        xor     $s0,$s0,$acc00          # r0^r2
+       rotlwi  $acc15,$s3,16
        xor     $s1,$s1,$acc01
-       xor     $s2,$s2,$acc02
-       xor     $s3,$s3,$acc03
        rotrwi  $s0,$s0,24              # ROTATE(r2^r0,24)
+       xor     $s2,$s2,$acc02
        rotrwi  $s1,$s1,24
+       xor     $s3,$s3,$acc03
        rotrwi  $s2,$s2,24
-       rotrwi  $s3,$s3,24
        xor     $s0,$s0,$acc00          # ROTATE(r2^r0,24)^r2
+       rotrwi  $s3,$s3,24
        xor     $s1,$s1,$acc01
        xor     $s2,$s2,$acc02
        xor     $s3,$s3,$acc03
        rotlwi  $acc08,$acc12,8         # ROTATE(r0,24)
-       rotlwi  $acc09,$acc13,8
-       rotlwi  $acc10,$acc14,8
-       rotlwi  $acc11,$acc15,8
        xor     $s0,$s0,$acc12          #
+       rotlwi  $acc09,$acc13,8
        xor     $s1,$s1,$acc13
+       rotlwi  $acc10,$acc14,8
        xor     $s2,$s2,$acc14
+       rotlwi  $acc11,$acc15,8
        xor     $s3,$s3,$acc15
        xor     $s0,$s0,$acc08          #
        xor     $s1,$s1,$acc09
@@ -673,14 +762,15 @@ Lenc_compact_done:
        xor     $s2,$s2,$t2
        xor     $s3,$s3,$t3
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
 
 .globl .AES_decrypt
 .align 7
 .AES_decrypt:
-       mflr    r0
        $STU    $sp,-$FRAME($sp)
+       mflr    r0
 
-       $PUSH   r0,`$FRAME-$SIZE_T*21`($sp)
        $PUSH   $toc,`$FRAME-$SIZE_T*20`($sp)
        $PUSH   r13,`$FRAME-$SIZE_T*19`($sp)
        $PUSH   r14,`$FRAME-$SIZE_T*18`($sp)
@@ -701,7 +791,14 @@ Lenc_compact_done:
        $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
        $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
        $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
+       $PUSH   r0,`$FRAME+$LRSAVE`($sp)
 
+       andi.   $t0,$inp,3
+       andi.   $t1,$out,3
+       or.     $t0,$t0,$t1
+       bne     Ldec_unaligned
+
+Ldec_unaligned_ok:
        lwz     $s0,0($inp)
        lwz     $s1,4($inp)
        lwz     $s2,8($inp)
@@ -712,8 +809,80 @@ Lenc_compact_done:
        stw     $s1,4($out)
        stw     $s2,8($out)
        stw     $s3,12($out)
+       b       Ldec_done
+
+Ldec_unaligned:
+       subfic  $t0,$inp,4096
+       subfic  $t1,$out,4096
+       andi.   $t0,$t0,4096-16
+       beq     Ldec_xpage
+       andi.   $t1,$t1,4096-16
+       bne     Ldec_unaligned_ok
+
+Ldec_xpage:
+       lbz     $acc00,0($inp)
+       lbz     $acc01,1($inp)
+       lbz     $acc02,2($inp)
+       lbz     $s0,3($inp)
+       lbz     $acc04,4($inp)
+       lbz     $acc05,5($inp)
+       lbz     $acc06,6($inp)
+       lbz     $s1,7($inp)
+       lbz     $acc08,8($inp)
+       lbz     $acc09,9($inp)
+       lbz     $acc10,10($inp)
+       insrwi  $s0,$acc00,8,0
+       lbz     $s2,11($inp)
+       insrwi  $s1,$acc04,8,0
+       lbz     $acc12,12($inp)
+       insrwi  $s0,$acc01,8,8
+       lbz     $acc13,13($inp)
+       insrwi  $s1,$acc05,8,8
+       lbz     $acc14,14($inp)
+       insrwi  $s0,$acc02,8,16
+       lbz     $s3,15($inp)
+       insrwi  $s1,$acc06,8,16
+       insrwi  $s2,$acc08,8,0
+       insrwi  $s3,$acc12,8,0
+       insrwi  $s2,$acc09,8,8
+       insrwi  $s3,$acc13,8,8
+       insrwi  $s2,$acc10,8,16
+       insrwi  $s3,$acc14,8,16
+
+       bl      LAES_Td
+       bl      Lppc_AES_decrypt_compact
 
-       $POP    r0,`$FRAME-$SIZE_T*21`($sp)
+       extrwi  $acc00,$s0,8,0
+       extrwi  $acc01,$s0,8,8
+       stb     $acc00,0($out)
+       extrwi  $acc02,$s0,8,16
+       stb     $acc01,1($out)
+       stb     $acc02,2($out)
+       extrwi  $acc04,$s1,8,0
+       stb     $s0,3($out)
+       extrwi  $acc05,$s1,8,8
+       stb     $acc04,4($out)
+       extrwi  $acc06,$s1,8,16
+       stb     $acc05,5($out)
+       stb     $acc06,6($out)
+       extrwi  $acc08,$s2,8,0
+       stb     $s1,7($out)
+       extrwi  $acc09,$s2,8,8
+       stb     $acc08,8($out)
+       extrwi  $acc10,$s2,8,16
+       stb     $acc09,9($out)
+       stb     $acc10,10($out)
+       extrwi  $acc12,$s3,8,0
+       stb     $s2,11($out)
+       extrwi  $acc13,$s3,8,8
+       stb     $acc12,12($out)
+       extrwi  $acc14,$s3,8,16
+       stb     $acc13,13($out)
+       stb     $acc14,14($out)
+       stb     $s3,15($out)
+
+Ldec_done:
+       $POP    r0,`$FRAME+$LRSAVE`($sp)
        $POP    $toc,`$FRAME-$SIZE_T*20`($sp)
        $POP    r13,`$FRAME-$SIZE_T*19`($sp)
        $POP    r14,`$FRAME-$SIZE_T*18`($sp)
@@ -737,18 +906,21 @@ Lenc_compact_done:
        mtlr    r0
        addi    $sp,$sp,$FRAME
        blr
+       .long   0
+       .byte   0,12,4,1,0x80,18,3,0
+       .long   0
 
 .align 5
 Lppc_AES_decrypt:
        lwz     $acc00,240($key)
-       lwz     $t0,0($key)
-       lwz     $t1,4($key)
-       lwz     $t2,8($key)
-       lwz     $t3,12($key)
        addi    $Tbl1,$Tbl0,3
+       lwz     $t0,0($key)
        addi    $Tbl2,$Tbl0,2
+       lwz     $t1,4($key)
        addi    $Tbl3,$Tbl0,1
+       lwz     $t2,8($key)
        addi    $acc00,$acc00,-1
+       lwz     $t3,12($key)
        addi    $key,$key,16
        xor     $s0,$s0,$t0
        xor     $s1,$s1,$t1
@@ -762,44 +934,44 @@ Ldec_loop:
        rlwinm  $acc02,$s2,`32-24+3`,21,28
        rlwinm  $acc03,$s3,`32-24+3`,21,28
        lwz     $t0,0($key)
-       lwz     $t1,4($key)
        rlwinm  $acc04,$s3,`32-16+3`,21,28
+       lwz     $t1,4($key)
        rlwinm  $acc05,$s0,`32-16+3`,21,28
        lwz     $t2,8($key)
-       lwz     $t3,12($key)
        rlwinm  $acc06,$s1,`32-16+3`,21,28
+       lwz     $t3,12($key)
        rlwinm  $acc07,$s2,`32-16+3`,21,28
        lwzx    $acc00,$Tbl0,$acc00
-       lwzx    $acc01,$Tbl0,$acc01
        rlwinm  $acc08,$s2,`32-8+3`,21,28
+       lwzx    $acc01,$Tbl0,$acc01
        rlwinm  $acc09,$s3,`32-8+3`,21,28
        lwzx    $acc02,$Tbl0,$acc02
-       lwzx    $acc03,$Tbl0,$acc03
        rlwinm  $acc10,$s0,`32-8+3`,21,28
+       lwzx    $acc03,$Tbl0,$acc03
        rlwinm  $acc11,$s1,`32-8+3`,21,28
        lwzx    $acc04,$Tbl1,$acc04
-       lwzx    $acc05,$Tbl1,$acc05
        rlwinm  $acc12,$s1,`0+3`,21,28
+       lwzx    $acc05,$Tbl1,$acc05
        rlwinm  $acc13,$s2,`0+3`,21,28
        lwzx    $acc06,$Tbl1,$acc06
-       lwzx    $acc07,$Tbl1,$acc07
        rlwinm  $acc14,$s3,`0+3`,21,28
+       lwzx    $acc07,$Tbl1,$acc07
        rlwinm  $acc15,$s0,`0+3`,21,28
        lwzx    $acc08,$Tbl2,$acc08
-       lwzx    $acc09,$Tbl2,$acc09
        xor     $t0,$t0,$acc00
+       lwzx    $acc09,$Tbl2,$acc09
        xor     $t1,$t1,$acc01
        lwzx    $acc10,$Tbl2,$acc10
-       lwzx    $acc11,$Tbl2,$acc11
        xor     $t2,$t2,$acc02
+       lwzx    $acc11,$Tbl2,$acc11
        xor     $t3,$t3,$acc03
        lwzx    $acc12,$Tbl3,$acc12
-       lwzx    $acc13,$Tbl3,$acc13
        xor     $t0,$t0,$acc04
+       lwzx    $acc13,$Tbl3,$acc13
        xor     $t1,$t1,$acc05
        lwzx    $acc14,$Tbl3,$acc14
-       lwzx    $acc15,$Tbl3,$acc15
        xor     $t2,$t2,$acc06
+       lwzx    $acc15,$Tbl3,$acc15
        xor     $t3,$t3,$acc07
        xor     $t0,$t0,$acc08
        xor     $t1,$t1,$acc09
@@ -815,56 +987,56 @@ Ldec_loop:
        addi    $Tbl2,$Tbl0,2048
        nop
        lwz     $t0,0($key)
-       lwz     $t1,4($key)
        rlwinm  $acc00,$s0,`32-24`,24,31
+       lwz     $t1,4($key)
        rlwinm  $acc01,$s1,`32-24`,24,31
        lwz     $t2,8($key)
-       lwz     $t3,12($key)
        rlwinm  $acc02,$s2,`32-24`,24,31
+       lwz     $t3,12($key)
        rlwinm  $acc03,$s3,`32-24`,24,31
        lwz     $acc08,`2048+0`($Tbl0)  ! prefetch Td4
-       lwz     $acc09,`2048+32`($Tbl0)
        rlwinm  $acc04,$s3,`32-16`,24,31
+       lwz     $acc09,`2048+32`($Tbl0)
        rlwinm  $acc05,$s0,`32-16`,24,31
        lwz     $acc10,`2048+64`($Tbl0)
-       lwz     $acc11,`2048+96`($Tbl0)
        lbzx    $acc00,$Tbl2,$acc00
+       lwz     $acc11,`2048+96`($Tbl0)
        lbzx    $acc01,$Tbl2,$acc01
        lwz     $acc12,`2048+128`($Tbl0)
-       lwz     $acc13,`2048+160`($Tbl0)
        rlwinm  $acc06,$s1,`32-16`,24,31
+       lwz     $acc13,`2048+160`($Tbl0)
        rlwinm  $acc07,$s2,`32-16`,24,31
        lwz     $acc14,`2048+192`($Tbl0)
-       lwz     $acc15,`2048+224`($Tbl0)
        rlwinm  $acc08,$s2,`32-8`,24,31
+       lwz     $acc15,`2048+224`($Tbl0)
        rlwinm  $acc09,$s3,`32-8`,24,31
        lbzx    $acc02,$Tbl2,$acc02
-       lbzx    $acc03,$Tbl2,$acc03
        rlwinm  $acc10,$s0,`32-8`,24,31
+       lbzx    $acc03,$Tbl2,$acc03
        rlwinm  $acc11,$s1,`32-8`,24,31
        lbzx    $acc04,$Tbl2,$acc04
-       lbzx    $acc05,$Tbl2,$acc05
        rlwinm  $acc12,$s1,`0`,24,31
+       lbzx    $acc05,$Tbl2,$acc05
        rlwinm  $acc13,$s2,`0`,24,31
        lbzx    $acc06,$Tbl2,$acc06
-       lbzx    $acc07,$Tbl2,$acc07
        rlwinm  $acc14,$s3,`0`,24,31
+       lbzx    $acc07,$Tbl2,$acc07
        rlwinm  $acc15,$s0,`0`,24,31
        lbzx    $acc08,$Tbl2,$acc08
-       lbzx    $acc09,$Tbl2,$acc09
        rlwinm  $s0,$acc00,24,0,7
+       lbzx    $acc09,$Tbl2,$acc09
        rlwinm  $s1,$acc01,24,0,7
        lbzx    $acc10,$Tbl2,$acc10
-       lbzx    $acc11,$Tbl2,$acc11
        rlwinm  $s2,$acc02,24,0,7
+       lbzx    $acc11,$Tbl2,$acc11
        rlwinm  $s3,$acc03,24,0,7
        lbzx    $acc12,$Tbl2,$acc12
-       lbzx    $acc13,$Tbl2,$acc13
        rlwimi  $s0,$acc04,16,8,15
+       lbzx    $acc13,$Tbl2,$acc13
        rlwimi  $s1,$acc05,16,8,15
        lbzx    $acc14,$Tbl2,$acc14
-       lbzx    $acc15,$Tbl2,$acc15
        rlwimi  $s2,$acc06,16,8,15
+       lbzx    $acc15,$Tbl2,$acc15
        rlwimi  $s3,$acc07,16,8,15
        rlwimi  $s0,$acc08,8,16,23
        rlwimi  $s1,$acc09,8,16,23
@@ -879,20 +1051,22 @@ Ldec_loop:
        xor     $s2,$s2,$t2
        xor     $s3,$s3,$t3
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
 
 .align 4
 Lppc_AES_decrypt_compact:
        lwz     $acc00,240($key)
-       lwz     $t0,0($key)
-       lwz     $t1,4($key)
-       lwz     $t2,8($key)
-       lwz     $t3,12($key)
        addi    $Tbl1,$Tbl0,2048
+       lwz     $t0,0($key)
        lis     $mask80,0x8080
+       lwz     $t1,4($key)
        lis     $mask1b,0x1b1b
-       addi    $key,$key,16
+       lwz     $t2,8($key)
        ori     $mask80,$mask80,0x8080
+       lwz     $t3,12($key)
        ori     $mask1b,$mask1b,0x1b1b
+       addi    $key,$key,16
 ___
 $code.=<<___ if ($SIZE_T==8);
        insrdi  $mask80,$mask80,32,0
@@ -904,10 +1078,10 @@ $code.=<<___;
 Ldec_compact_loop:
        xor     $s0,$s0,$t0
        xor     $s1,$s1,$t1
-       xor     $s2,$s2,$t2
-       xor     $s3,$s3,$t3
        rlwinm  $acc00,$s0,`32-24`,24,31
+       xor     $s2,$s2,$t2
        rlwinm  $acc01,$s1,`32-24`,24,31
+       xor     $s3,$s3,$t3
        rlwinm  $acc02,$s2,`32-24`,24,31
        rlwinm  $acc03,$s3,`32-24`,24,31
        rlwinm  $acc04,$s3,`32-16`,24,31
@@ -915,48 +1089,48 @@ Ldec_compact_loop:
        rlwinm  $acc06,$s1,`32-16`,24,31
        rlwinm  $acc07,$s2,`32-16`,24,31
        lbzx    $acc00,$Tbl1,$acc00
-       lbzx    $acc01,$Tbl1,$acc01
        rlwinm  $acc08,$s2,`32-8`,24,31
+       lbzx    $acc01,$Tbl1,$acc01
        rlwinm  $acc09,$s3,`32-8`,24,31
        lbzx    $acc02,$Tbl1,$acc02
-       lbzx    $acc03,$Tbl1,$acc03
        rlwinm  $acc10,$s0,`32-8`,24,31
+       lbzx    $acc03,$Tbl1,$acc03
        rlwinm  $acc11,$s1,`32-8`,24,31
        lbzx    $acc04,$Tbl1,$acc04
-       lbzx    $acc05,$Tbl1,$acc05
        rlwinm  $acc12,$s1,`0`,24,31
+       lbzx    $acc05,$Tbl1,$acc05
        rlwinm  $acc13,$s2,`0`,24,31
        lbzx    $acc06,$Tbl1,$acc06
-       lbzx    $acc07,$Tbl1,$acc07
        rlwinm  $acc14,$s3,`0`,24,31
+       lbzx    $acc07,$Tbl1,$acc07
        rlwinm  $acc15,$s0,`0`,24,31
        lbzx    $acc08,$Tbl1,$acc08
-       lbzx    $acc09,$Tbl1,$acc09
        rlwinm  $s0,$acc00,24,0,7
+       lbzx    $acc09,$Tbl1,$acc09
        rlwinm  $s1,$acc01,24,0,7
        lbzx    $acc10,$Tbl1,$acc10
-       lbzx    $acc11,$Tbl1,$acc11
        rlwinm  $s2,$acc02,24,0,7
+       lbzx    $acc11,$Tbl1,$acc11
        rlwinm  $s3,$acc03,24,0,7
        lbzx    $acc12,$Tbl1,$acc12
-       lbzx    $acc13,$Tbl1,$acc13
        rlwimi  $s0,$acc04,16,8,15
+       lbzx    $acc13,$Tbl1,$acc13
        rlwimi  $s1,$acc05,16,8,15
        lbzx    $acc14,$Tbl1,$acc14
-       lbzx    $acc15,$Tbl1,$acc15
        rlwimi  $s2,$acc06,16,8,15
+       lbzx    $acc15,$Tbl1,$acc15
        rlwimi  $s3,$acc07,16,8,15
        rlwimi  $s0,$acc08,8,16,23
        rlwimi  $s1,$acc09,8,16,23
        rlwimi  $s2,$acc10,8,16,23
        rlwimi  $s3,$acc11,8,16,23
        lwz     $t0,0($key)
-       lwz     $t1,4($key)
        or      $s0,$s0,$acc12
+       lwz     $t1,4($key)
        or      $s1,$s1,$acc13
        lwz     $t2,8($key)
-       lwz     $t3,12($key)
        or      $s2,$s2,$acc14
+       lwz     $t3,12($key)
        or      $s3,$s3,$acc15
 
        addi    $key,$key,16
@@ -1030,12 +1204,12 @@ $code.=<<___ if ($SIZE_T==4);
        and     $acc02,$s2,$mask80
        and     $acc03,$s3,$mask80
        srwi    $acc04,$acc00,7         # r1>>7
-       srwi    $acc05,$acc01,7
-       srwi    $acc06,$acc02,7
-       srwi    $acc07,$acc03,7
        andc    $acc08,$s0,$mask80      # r0&0x7f7f7f7f
+       srwi    $acc05,$acc01,7
        andc    $acc09,$s1,$mask80
+       srwi    $acc06,$acc02,7
        andc    $acc10,$s2,$mask80
+       srwi    $acc07,$acc03,7
        andc    $acc11,$s3,$mask80
        sub     $acc00,$acc00,$acc04    # r1-(r1>>7)
        sub     $acc01,$acc01,$acc05
@@ -1059,12 +1233,12 @@ $code.=<<___ if ($SIZE_T==4);
        and     $acc06,$acc02,$mask80
        and     $acc07,$acc03,$mask80
        srwi    $acc08,$acc04,7         # r1>>7
-       srwi    $acc09,$acc05,7
-       srwi    $acc10,$acc06,7
-       srwi    $acc11,$acc07,7
        andc    $acc12,$acc00,$mask80   # r2&0x7f7f7f7f
+       srwi    $acc09,$acc05,7
        andc    $acc13,$acc01,$mask80
+       srwi    $acc10,$acc06,7
        andc    $acc14,$acc02,$mask80
+       srwi    $acc11,$acc07,7
        andc    $acc15,$acc03,$mask80
        sub     $acc04,$acc04,$acc08    # r1-(r1>>7)
        sub     $acc05,$acc05,$acc09
@@ -1085,13 +1259,13 @@ $code.=<<___ if ($SIZE_T==4);
 
        and     $acc08,$acc04,$mask80   # r1=r4&0x80808080
        and     $acc09,$acc05,$mask80
-       and     $acc10,$acc06,$mask80
-       and     $acc11,$acc07,$mask80
        srwi    $acc12,$acc08,7         # r1>>7
+       and     $acc10,$acc06,$mask80
        srwi    $acc13,$acc09,7
+       and     $acc11,$acc07,$mask80
        srwi    $acc14,$acc10,7
-       srwi    $acc15,$acc11,7
        sub     $acc08,$acc08,$acc12    # r1-(r1>>7)
+       srwi    $acc15,$acc11,7
        sub     $acc09,$acc09,$acc13
        sub     $acc10,$acc10,$acc14
        sub     $acc11,$acc11,$acc15
@@ -1124,10 +1298,10 @@ ___
 $code.=<<___;
        rotrwi  $s0,$s0,8               # = ROTATE(r0,8)
        rotrwi  $s1,$s1,8
-       rotrwi  $s2,$s2,8
-       rotrwi  $s3,$s3,8
        xor     $s0,$s0,$acc00          # ^= r2^r0
+       rotrwi  $s2,$s2,8
        xor     $s1,$s1,$acc01
+       rotrwi  $s3,$s3,8
        xor     $s2,$s2,$acc02
        xor     $s3,$s3,$acc03
        xor     $acc00,$acc00,$acc08
@@ -1135,32 +1309,32 @@ $code.=<<___;
        xor     $acc02,$acc02,$acc10
        xor     $acc03,$acc03,$acc11
        xor     $s0,$s0,$acc04          # ^= r4^r0
-       xor     $s1,$s1,$acc05
-       xor     $s2,$s2,$acc06
-       xor     $s3,$s3,$acc07
        rotrwi  $acc00,$acc00,24
+       xor     $s1,$s1,$acc05
        rotrwi  $acc01,$acc01,24
+       xor     $s2,$s2,$acc06
        rotrwi  $acc02,$acc02,24
+       xor     $s3,$s3,$acc07
        rotrwi  $acc03,$acc03,24
        xor     $acc04,$acc04,$acc08
        xor     $acc05,$acc05,$acc09
        xor     $acc06,$acc06,$acc10
        xor     $acc07,$acc07,$acc11
        xor     $s0,$s0,$acc08          # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
-       xor     $s1,$s1,$acc09
-       xor     $s2,$s2,$acc10
-       xor     $s3,$s3,$acc11
        rotrwi  $acc04,$acc04,16
+       xor     $s1,$s1,$acc09
        rotrwi  $acc05,$acc05,16
+       xor     $s2,$s2,$acc10
        rotrwi  $acc06,$acc06,16
+       xor     $s3,$s3,$acc11
        rotrwi  $acc07,$acc07,16
        xor     $s0,$s0,$acc00          # ^= ROTATE(r8^r2^r0,24)
-       xor     $s1,$s1,$acc01
-       xor     $s2,$s2,$acc02
-       xor     $s3,$s3,$acc03
        rotrwi  $acc08,$acc08,8
+       xor     $s1,$s1,$acc01
        rotrwi  $acc09,$acc09,8
+       xor     $s2,$s2,$acc02
        rotrwi  $acc10,$acc10,8
+       xor     $s3,$s3,$acc03
        rotrwi  $acc11,$acc11,8
        xor     $s0,$s0,$acc04          # ^= ROTATE(r8^r4^r0,16)
        xor     $s1,$s1,$acc05
@@ -1179,7 +1353,9 @@ Ldec_compact_done:
        xor     $s2,$s2,$t2
        xor     $s3,$s3,$t3
        blr
-.long  0
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
+
 .asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
 .align 7
 ___
index 7849eae..f9b6992 100644 (file)
@@ -31,7 +31,6 @@ if ($flavour =~ /32/) {
        $BNSZ=  $BITS/8;
        $SIZE_T=4;
        $RZONE= 224;
-       $FRAME= $SIZE_T*16;
 
        $LD=    "lwz";          # load
        $LDU=   "lwzu";         # load and update
@@ -51,7 +50,6 @@ if ($flavour =~ /32/) {
        $BNSZ=  $BITS/8;
        $SIZE_T=8;
        $RZONE= 288;
-       $FRAME= $SIZE_T*16;
 
        # same as above, but 64-bit mnemonics...
        $LD=    "ld";           # load
@@ -69,6 +67,9 @@ if ($flavour =~ /32/) {
        $POP=   $LD;
 } else { die "nonsense $flavour"; }
 
+$FRAME=8*$SIZE_T+$RZONE;
+$LOCALS=8*$SIZE_T;
+
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -89,18 +90,18 @@ $aj="r10";
 $nj="r11";
 $tj="r12";
 # non-volatile registers
-$i="r14";
-$j="r15";
-$tp="r16";
-$m0="r17";
-$m1="r18";
-$lo0="r19";
-$hi0="r20";
-$lo1="r21";
-$hi1="r22";
-$alo="r23";
-$ahi="r24";
-$nlo="r25";
+$i="r20";
+$j="r21";
+$tp="r22";
+$m0="r23";
+$m1="r24";
+$lo0="r25";
+$hi0="r26";
+$lo1="r27";
+$hi1="r28";
+$alo="r29";
+$ahi="r30";
+$nlo="r31";
 #
 $nhi="r0";
 
@@ -108,42 +109,48 @@ $code=<<___;
 .machine "any"
 .text
 
-.globl .bn_mul_mont
+.globl .bn_mul_mont_int
 .align 4
-.bn_mul_mont:
+.bn_mul_mont_int:
        cmpwi   $num,4
        mr      $rp,r3          ; $rp is reassigned
        li      r3,0
        bltlr
-
+___
+$code.=<<___ if ($BNSZ==4);
+       cmpwi   $num,32         ; longer key performance is not better
+       bgelr
+___
+$code.=<<___;
        slwi    $num,$num,`log($BNSZ)/log(2)`
        li      $tj,-4096
-       addi    $ovf,$num,`$FRAME+$RZONE`
+       addi    $ovf,$num,$FRAME
        subf    $ovf,$ovf,$sp   ; $sp-$ovf
        and     $ovf,$ovf,$tj   ; minimize TLB usage
        subf    $ovf,$sp,$ovf   ; $ovf-$sp
+       mr      $tj,$sp
        srwi    $num,$num,`log($BNSZ)/log(2)`
        $STUX   $sp,$sp,$ovf
 
-       $PUSH   r14,`4*$SIZE_T`($sp)
-       $PUSH   r15,`5*$SIZE_T`($sp)
-       $PUSH   r16,`6*$SIZE_T`($sp)
-       $PUSH   r17,`7*$SIZE_T`($sp)
-       $PUSH   r18,`8*$SIZE_T`($sp)
-       $PUSH   r19,`9*$SIZE_T`($sp)
-       $PUSH   r20,`10*$SIZE_T`($sp)
-       $PUSH   r21,`11*$SIZE_T`($sp)
-       $PUSH   r22,`12*$SIZE_T`($sp)
-       $PUSH   r23,`13*$SIZE_T`($sp)
-       $PUSH   r24,`14*$SIZE_T`($sp)
-       $PUSH   r25,`15*$SIZE_T`($sp)
+       $PUSH   r20,`-12*$SIZE_T`($tj)
+       $PUSH   r21,`-11*$SIZE_T`($tj)
+       $PUSH   r22,`-10*$SIZE_T`($tj)
+       $PUSH   r23,`-9*$SIZE_T`($tj)
+       $PUSH   r24,`-8*$SIZE_T`($tj)
+       $PUSH   r25,`-7*$SIZE_T`($tj)
+       $PUSH   r26,`-6*$SIZE_T`($tj)
+       $PUSH   r27,`-5*$SIZE_T`($tj)
+       $PUSH   r28,`-4*$SIZE_T`($tj)
+       $PUSH   r29,`-3*$SIZE_T`($tj)
+       $PUSH   r30,`-2*$SIZE_T`($tj)
+       $PUSH   r31,`-1*$SIZE_T`($tj)
 
        $LD     $n0,0($n0)      ; pull n0[0] value
        addi    $num,$num,-2    ; adjust $num for counter register
 \f
        $LD     $m0,0($bp)      ; m0=bp[0]
        $LD     $aj,0($ap)      ; ap[0]
-       addi    $tp,$sp,$FRAME
+       addi    $tp,$sp,$LOCALS
        $UMULL  $lo0,$aj,$m0    ; ap[0]*bp[0]
        $UMULH  $hi0,$aj,$m0
 
@@ -205,8 +212,8 @@ L1st:
 Louter:
        $LDX    $m0,$bp,$i      ; m0=bp[i]
        $LD     $aj,0($ap)      ; ap[0]
-       addi    $tp,$sp,$FRAME
-       $LD     $tj,$FRAME($sp) ; tp[0]
+       addi    $tp,$sp,$LOCALS
+       $LD     $tj,$LOCALS($sp); tp[0]
        $UMULL  $lo0,$aj,$m0    ; ap[0]*bp[i]
        $UMULH  $hi0,$aj,$m0
        $LD     $aj,$BNSZ($ap)  ; ap[1]
@@ -273,7 +280,7 @@ Linner:
 \f
        addi    $num,$num,2     ; restore $num
        subfc   $j,$j,$j        ; j=0 and "clear" XER[CA]
-       addi    $tp,$sp,$FRAME
+       addi    $tp,$sp,$LOCALS
        mtctr   $num
 
 .align 4
@@ -299,23 +306,27 @@ Lcopy:                            ; copy or in-place refresh
        addi    $j,$j,$BNSZ
        bdnz-   Lcopy
 
-       $POP    r14,`4*$SIZE_T`($sp)
-       $POP    r15,`5*$SIZE_T`($sp)
-       $POP    r16,`6*$SIZE_T`($sp)
-       $POP    r17,`7*$SIZE_T`($sp)
-       $POP    r18,`8*$SIZE_T`($sp)
-       $POP    r19,`9*$SIZE_T`($sp)
-       $POP    r20,`10*$SIZE_T`($sp)
-       $POP    r21,`11*$SIZE_T`($sp)
-       $POP    r22,`12*$SIZE_T`($sp)
-       $POP    r23,`13*$SIZE_T`($sp)
-       $POP    r24,`14*$SIZE_T`($sp)
-       $POP    r25,`15*$SIZE_T`($sp)
-       $POP    $sp,0($sp)
+       $POP    $tj,0($sp)
        li      r3,1
+       $POP    r20,`-12*$SIZE_T`($tj)
+       $POP    r21,`-11*$SIZE_T`($tj)
+       $POP    r22,`-10*$SIZE_T`($tj)
+       $POP    r23,`-9*$SIZE_T`($tj)
+       $POP    r24,`-8*$SIZE_T`($tj)
+       $POP    r25,`-7*$SIZE_T`($tj)
+       $POP    r26,`-6*$SIZE_T`($tj)
+       $POP    r27,`-5*$SIZE_T`($tj)
+       $POP    r28,`-4*$SIZE_T`($tj)
+       $POP    r29,`-3*$SIZE_T`($tj)
+       $POP    r30,`-2*$SIZE_T`($tj)
+       $POP    r31,`-1*$SIZE_T`($tj)
+       mr      $sp,$tj
        blr
        .long   0
-.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+       .byte   0,12,4,0,0x80,12,6,0
+       .long   0
+
+.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
index f409317..1249ce2 100644 (file)
@@ -389,7 +389,9 @@ $data=<<EOF;
        $ST             r9,`6*$BNSZ`(r3)        #r[6]=c1
        $ST             r10,`7*$BNSZ`(r3)       #r[7]=c2
        blr
-       .long   0x00000000
+       .long   0
+       .byte   0,12,0x14,0,0,0,2,0
+       .long   0
 
 #
 #      NOTE:   The following label name should be changed to
@@ -814,8 +816,9 @@ $data=<<EOF;
 
 
        blr
-
-       .long   0x00000000
+       .long   0
+       .byte   0,12,0x14,0,0,0,2,0
+       .long   0
 
 #
 #      NOTE:   The following label name should be changed to
@@ -966,7 +969,9 @@ $data=<<EOF;
        $ST     r10,`6*$BNSZ`(r3)       #r[6]=c1
        $ST     r11,`7*$BNSZ`(r3)       #r[7]=c2
        blr
-       .long   0x00000000
+       .long   0
+       .byte   0,12,0x14,0,0,0,3,0
+       .long   0
 
 #
 #      NOTE:   The following label name should be changed to
@@ -1502,7 +1507,9 @@ $data=<<EOF;
        $ST     r12,`14*$BNSZ`(r3)      #r[14]=c3;
        $ST     r10,`15*$BNSZ`(r3)      #r[15]=c1;
        blr
-       .long   0x00000000
+       .long   0
+       .byte   0,12,0x14,0,0,0,3,0
+       .long   0
 
 #
 #      NOTE:   The following label name should be changed to
@@ -1550,8 +1557,9 @@ Lppcasm_sub_adios:
        subfze  r3,r0           # if carry bit is set then r3 = 0 else -1
        andi.   r3,r3,1         # keep only last bit.
        blr
-       .long   0x00000000
-
+       .long   0
+       .byte   0,12,0x14,0,0,0,4,0
+       .long   0
 
 #
 #      NOTE:   The following label name should be changed to
@@ -1594,7 +1602,9 @@ Lppcasm_add_mainloop:
 Lppcasm_add_adios:     
        addze   r3,r0                   #return carry bit.
        blr
-       .long   0x00000000
+       .long   0
+       .byte   0,12,0x14,0,0,0,4,0
+       .long   0
 
 #
 #      NOTE:   The following label name should be changed to
@@ -1707,7 +1717,9 @@ Lppcasm_div8:
 Lppcasm_div9:
        or      r3,r8,r0
        blr
-       .long   0x00000000
+       .long   0
+       .byte   0,12,0x14,0,0,0,3,0
+       .long   0
 
 #
 #      NOTE:   The following label name should be changed to
@@ -1746,8 +1758,9 @@ Lppcasm_sqr_mainloop:
        bdnz-   Lppcasm_sqr_mainloop
 Lppcasm_sqr_adios:     
        blr
-       .long   0x00000000
-
+       .long   0
+       .byte   0,12,0x14,0,0,0,3,0
+       .long   0
 
 #
 #      NOTE:   The following label name should be changed to
@@ -1850,7 +1863,9 @@ Lppcasm_mw_REM:
 Lppcasm_mw_OVER:       
        addi    r3,r12,0
        blr
-       .long   0x00000000
+       .long   0
+       .byte   0,12,0x14,0,0,0,4,0
+       .long   0
 
 #
 #      NOTE:   The following label name should be changed to
@@ -1973,7 +1988,9 @@ Lppcasm_maw_leftover:
 Lppcasm_maw_adios:     
        addi    r3,r12,0
        blr
-       .long   0x00000000
+       .long   0
+       .byte   0,12,0x14,0,0,0,4,0
+       .long   0
        .align  4
 EOF
 $data =~ s/\`([^\`]*)\`/eval $1/gem;
index 3449b35..a14e769 100644 (file)
 # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
 # in absolute terms, but it's apparently the way Power 6 is...
 
+# December 2009
+
+# Adapted for 32-bit build this module delivers 25-120%, yes, more
+# than *twice* for longer keys, performance improvement over 32-bit
+# ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes
+# even 64-bit integer operations and the trouble is that most PPC
+# operating systems don't preserve upper halves of general purpose
+# registers upon 32-bit signal delivery. They do preserve them upon
+# context switch, but not signalling:-( This means that asynchronous
+# signals have to be blocked upon entry to this subroutine. Signal
+# masking (and of course complementary unmasking) has quite an impact
+# on performance, naturally larger for shorter keys. It's so severe
+# that 512-bit key performance can be as low as 1/3 of expected one.
+# This is why this routine can be engaged for longer key operations
+# only on these OSes, see crypto/ppccap.c for further details. MacOS X
+# is an exception from this and doesn't require signal masking, and
+# that's where above improvement coefficients were collected. For
+# others alternative would be to break dependence on upper halves of
+# GPRs by sticking to 32-bit integer operations...
+
 $flavour = shift;
 
 if ($flavour =~ /32/) {
        $SIZE_T=4;
        $RZONE= 224;
-       $FRAME= $SIZE_T*12+8*12;
-       $fname= "bn_mul_mont_ppc64";
+       $fname= "bn_mul_mont_fpu64";
 
        $STUX=  "stwux";        # store indexed and update
        $PUSH=  "stw";
        $POP=   "lwz";
-       die "not implemented yet";
 } elsif ($flavour =~ /64/) {
        $SIZE_T=8;
        $RZONE= 288;
-       $FRAME= $SIZE_T*12+8*12;
-       $fname= "bn_mul_mont";
+       $fname= "bn_mul_mont_fpu64";
 
        # same as above, but 64-bit mnemonics...
        $STUX=  "stdux";        # store indexed and update
@@ -76,7 +93,7 @@ die "can't locate ppc-xlate.pl";
 
 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
 
-$FRAME=($FRAME+63)&~63;
+$FRAME=64;     # padded frame header
 $TRANSFER=16*8;
 
 $carry="r0";
@@ -93,16 +110,16 @@ $tp="r10";
 $j="r11";
 $i="r12";
 # non-volatile registers
-$nap_d="r14";  # interleaved ap and np in double format
-$a0="r15";     # ap[0]
-$t0="r16";     # temporary registers
-$t1="r17";
-$t2="r18";
-$t3="r19";
-$t4="r20";
-$t5="r21";
-$t6="r22";
-$t7="r23";
+$nap_d="r22";  # interleaved ap and np in double format
+$a0="r23";     # ap[0]
+$t0="r24";     # temporary registers
+$t1="r25";
+$t2="r26";
+$t3="r27";
+$t4="r28";
+$t5="r29";
+$t6="r30";
+$t7="r31";
 
 # PPC offers enough register bank capacity to unroll inner loops twice
 #
@@ -132,28 +149,17 @@ $ba="f0"; $bb="f1";       $bc="f2";       $bd="f3";
 $na="f4";      $nb="f5";       $nc="f6";       $nd="f7";
 $dota="f8";    $dotb="f9";
 $A0="f10";     $A1="f11";      $A2="f12";      $A3="f13";
-$N0="f14";     $N1="f15";      $N2="f16";      $N3="f17";
-$T0a="f18";    $T0b="f19";
-$T1a="f20";    $T1b="f21";
-$T2a="f22";    $T2b="f23";
-$T3a="f24";    $T3b="f25";
+$N0="f20";     $N1="f21";      $N2="f22";      $N3="f23";
+$T0a="f24";    $T0b="f25";
+$T1a="f26";    $T1b="f27";
+$T2a="f28";    $T2b="f29";
+$T3a="f30";    $T3b="f31";
 \f
 # sp----------->+-------------------------------+
 #              | saved sp                      |
 #              +-------------------------------+
-#              |                               |
-#              +-------------------------------+
-#              | 10 saved gpr, r14-r23         |
-#              .                               .
-#              .                               .
-#   +12*size_t +-------------------------------+
-#              | 12 saved fpr, f14-f25         |
 #              .                               .
-#              .                               .
-#   +12*8      +-------------------------------+
-#              | padding to 64 byte boundary   |
-#              .                               .
-#   +X         +-------------------------------+
+#   +64                +-------------------------------+
 #              | 16 gpr<->fpr transfer zone    |
 #              .                               .
 #              .                               .
@@ -173,6 +179,16 @@ $T3a="f24";        $T3b="f25";
 #              .                               .
 #              .                               .
 #              +-------------------------------+
+#              .                               .
+#   -12*size_t +-------------------------------+
+#              | 10 saved gpr, r22-r31         |
+#              .                               .
+#              .                               .
+#   -12*8      +-------------------------------+
+#              | 12 saved fpr, f20-f31         |
+#              .                               .
+#              .                               .
+#              +-------------------------------+
 \f
 $code=<<___;
 .machine "any"
@@ -181,14 +197,14 @@ $code=<<___;
 .globl .$fname
 .align 5
 .$fname:
-       cmpwi   $num,4
+       cmpwi   $num,`3*8/$SIZE_T`
        mr      $rp,r3          ; $rp is reassigned
        li      r3,0            ; possible "not handled" return code
        bltlr-
-       andi.   r0,$num,1       ; $num has to be even
+       andi.   r0,$num,`16/$SIZE_T-1`          ; $num has to be "even"
        bnelr-
 
-       slwi    $num,$num,3     ; num*=8
+       slwi    $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG)
        li      $i,-4096
        slwi    $tp,$num,2      ; place for {an}p_{lh}[num], i.e. 4*num
        add     $tp,$tp,$num    ; place for tp[num+1]
@@ -196,35 +212,50 @@ $code=<<___;
        subf    $tp,$tp,$sp     ; $sp-$tp
        and     $tp,$tp,$i      ; minimize TLB usage
        subf    $tp,$sp,$tp     ; $tp-$sp
+       mr      $i,$sp
        $STUX   $sp,$sp,$tp     ; alloca
 
-       $PUSH   r14,`2*$SIZE_T`($sp)
-       $PUSH   r15,`3*$SIZE_T`($sp)
-       $PUSH   r16,`4*$SIZE_T`($sp)
-       $PUSH   r17,`5*$SIZE_T`($sp)
-       $PUSH   r18,`6*$SIZE_T`($sp)
-       $PUSH   r19,`7*$SIZE_T`($sp)
-       $PUSH   r20,`8*$SIZE_T`($sp)
-       $PUSH   r21,`9*$SIZE_T`($sp)
-       $PUSH   r22,`10*$SIZE_T`($sp)
-       $PUSH   r23,`11*$SIZE_T`($sp)
-       stfd    f14,`12*$SIZE_T+0`($sp)
-       stfd    f15,`12*$SIZE_T+8`($sp)
-       stfd    f16,`12*$SIZE_T+16`($sp)
-       stfd    f17,`12*$SIZE_T+24`($sp)
-       stfd    f18,`12*$SIZE_T+32`($sp)
-       stfd    f19,`12*$SIZE_T+40`($sp)
-       stfd    f20,`12*$SIZE_T+48`($sp)
-       stfd    f21,`12*$SIZE_T+56`($sp)
-       stfd    f22,`12*$SIZE_T+64`($sp)
-       stfd    f23,`12*$SIZE_T+72`($sp)
-       stfd    f24,`12*$SIZE_T+80`($sp)
-       stfd    f25,`12*$SIZE_T+88`($sp)
-
+       $PUSH   r22,`-12*8-10*$SIZE_T`($i)
+       $PUSH   r23,`-12*8-9*$SIZE_T`($i)
+       $PUSH   r24,`-12*8-8*$SIZE_T`($i)
+       $PUSH   r25,`-12*8-7*$SIZE_T`($i)
+       $PUSH   r26,`-12*8-6*$SIZE_T`($i)
+       $PUSH   r27,`-12*8-5*$SIZE_T`($i)
+       $PUSH   r28,`-12*8-4*$SIZE_T`($i)
+       $PUSH   r29,`-12*8-3*$SIZE_T`($i)
+       $PUSH   r30,`-12*8-2*$SIZE_T`($i)
+       $PUSH   r31,`-12*8-1*$SIZE_T`($i)
+       stfd    f20,`-12*8`($i)
+       stfd    f21,`-11*8`($i)
+       stfd    f22,`-10*8`($i)
+       stfd    f23,`-9*8`($i)
+       stfd    f24,`-8*8`($i)
+       stfd    f25,`-7*8`($i)
+       stfd    f26,`-6*8`($i)
+       stfd    f27,`-5*8`($i)
+       stfd    f28,`-4*8`($i)
+       stfd    f29,`-3*8`($i)
+       stfd    f30,`-2*8`($i)
+       stfd    f31,`-1*8`($i)
+___
+$code.=<<___ if ($SIZE_T==8);
        ld      $a0,0($ap)      ; pull ap[0] value
        ld      $n0,0($n0)      ; pull n0[0] value
        ld      $t3,0($bp)      ; bp[0]
-
+___
+$code.=<<___ if ($SIZE_T==4);
+       mr      $t1,$n0
+       lwz     $a0,0($ap)      ; pull ap[0,1] value
+       lwz     $t0,4($ap)
+       lwz     $n0,0($t1)      ; pull n0[0,1] value
+       lwz     $t1,4($t1)
+       lwz     $t3,0($bp)      ; bp[0,1]
+       lwz     $t2,4($bp)
+       insrdi  $a0,$t0,32,0
+       insrdi  $n0,$t1,32,0
+       insrdi  $t3,$t2,32,0
+___
+$code.=<<___;
        addi    $tp,$sp,`$FRAME+$TRANSFER+8+64`
        li      $i,-64
        add     $nap_d,$tp,$num
@@ -258,6 +289,8 @@ $code=<<___;
        std     $t5,`$FRAME+40`($sp)
        std     $t6,`$FRAME+48`($sp)
        std     $t7,`$FRAME+56`($sp)
+___
+$code.=<<___ if ($SIZE_T==8);
        lwz     $t0,4($ap)              ; load a[j] as 32-bit word pair
        lwz     $t1,0($ap)
        lwz     $t2,12($ap)             ; load a[j+1] as 32-bit word pair
@@ -266,6 +299,18 @@ $code=<<___;
        lwz     $t5,0($np)
        lwz     $t6,12($np)             ; load n[j+1] as 32-bit word pair
        lwz     $t7,8($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+       lwz     $t0,0($ap)              ; load a[j..j+3] as 32-bit word pairs
+       lwz     $t1,4($ap)
+       lwz     $t2,8($ap)
+       lwz     $t3,12($ap)
+       lwz     $t4,0($np)              ; load n[j..j+3] as 32-bit word pairs
+       lwz     $t5,4($np)
+       lwz     $t6,8($np)
+       lwz     $t7,12($np)
+___
+$code.=<<___;
        lfd     $ba,`$FRAME+0`($sp)
        lfd     $bb,`$FRAME+8`($sp)
        lfd     $bc,`$FRAME+16`($sp)
@@ -374,6 +419,8 @@ $code=<<___;
 \f
 .align 5
 L1st:
+___
+$code.=<<___ if ($SIZE_T==8);
        lwz     $t0,4($ap)              ; load a[j] as 32-bit word pair
        lwz     $t1,0($ap)
        lwz     $t2,12($ap)             ; load a[j+1] as 32-bit word pair
@@ -382,6 +429,18 @@ L1st:
        lwz     $t5,0($np)
        lwz     $t6,12($np)             ; load n[j+1] as 32-bit word pair
        lwz     $t7,8($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+       lwz     $t0,0($ap)              ; load a[j..j+3] as 32-bit word pairs
+       lwz     $t1,4($ap)
+       lwz     $t2,8($ap)
+       lwz     $t3,12($ap)
+       lwz     $t4,0($np)              ; load n[j..j+3] as 32-bit word pairs
+       lwz     $t5,4($np)
+       lwz     $t6,8($np)
+       lwz     $t7,12($np)
+___
+$code.=<<___;
        std     $t0,`$FRAME+64`($sp)
        std     $t1,`$FRAME+72`($sp)
        std     $t2,`$FRAME+80`($sp)
@@ -559,7 +618,17 @@ L1st:
        li      $i,8                    ; i=1
 .align 5
 Louter:
+___
+$code.=<<___ if ($SIZE_T==8);
        ldx     $t3,$bp,$i      ; bp[i]
+___
+$code.=<<___ if ($SIZE_T==4);
+       add     $t0,$bp,$i
+       lwz     $t3,0($t0)              ; bp[i,i+1]
+       lwz     $t0,4($t0)
+       insrdi  $t3,$t0,32,0
+___
+$code.=<<___;
        ld      $t6,`$FRAME+$TRANSFER+8`($sp)   ; tp[0]
        mulld   $t7,$a0,$t3     ; ap[0]*bp[i]
 
@@ -761,6 +830,13 @@ Linner:
        stfd    $T0b,`$FRAME+8`($sp)
         add    $t7,$t7,$carry
         addc   $t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4);          # adjust XER[CA]
+       extrdi  $t0,$t0,32,0
+       extrdi  $t1,$t1,32,0
+       adde    $t0,$t0,$t1
+___
+$code.=<<___;
        stfd    $T1a,`$FRAME+16`($sp)
        stfd    $T1b,`$FRAME+24`($sp)
         insrdi $t4,$t7,16,0            ; 64..127 bits
@@ -768,6 +844,13 @@ Linner:
        stfd    $T2a,`$FRAME+32`($sp)
        stfd    $T2b,`$FRAME+40`($sp)
         adde   $t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4);          # adjust XER[CA]
+       extrdi  $t4,$t4,32,0
+       extrdi  $t2,$t2,32,0
+       adde    $t4,$t4,$t2
+___
+$code.=<<___;
        stfd    $T3a,`$FRAME+48`($sp)
        stfd    $T3b,`$FRAME+56`($sp)
         addze  $carry,$carry
@@ -816,7 +899,21 @@ Linner:
        ld      $t7,`$FRAME+72`($sp)
 
        addc    $t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4);          # adjust XER[CA]
+       extrdi  $t0,$t0,32,0
+       extrdi  $t1,$t1,32,0
+       adde    $t0,$t0,$t1
+___
+$code.=<<___;
        adde    $t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4);          # adjust XER[CA]
+       extrdi  $t4,$t4,32,0
+       extrdi  $t2,$t2,32,0
+       adde    $t4,$t4,$t2
+___
+$code.=<<___;
        addze   $carry,$carry
 
        std     $t3,-16($tp)            ; tp[j-1]
@@ -835,7 +932,9 @@ Linner:
        subf    $nap_d,$t7,$nap_d       ; rewind pointer
        cmpw    $i,$num
        blt-    Louter
+___
 \f
+$code.=<<___ if ($SIZE_T==8);
        subf    $np,$num,$np    ; rewind np
        addi    $j,$j,1         ; restore counter
        subfc   $i,$i,$i        ; j=0 and "clear" XER[CA]
@@ -883,34 +982,105 @@ Lcopy:                           ; copy or in-place refresh
        stdx    $i,$t4,$i
        addi    $i,$i,16
        bdnz-   Lcopy
+___
+$code.=<<___ if ($SIZE_T==4);
+       subf    $np,$num,$np    ; rewind np
+       addi    $j,$j,1         ; restore counter
+       subfc   $i,$i,$i        ; j=0 and "clear" XER[CA]
+       addi    $tp,$sp,`$FRAME+$TRANSFER`
+       addi    $np,$np,-4
+       addi    $rp,$rp,-4
+       addi    $ap,$sp,`$FRAME+$TRANSFER+4`
+       mtctr   $j
+
+.align 4
+Lsub:  ld      $t0,8($tp)      ; load tp[j..j+3] in 64-bit word order
+       ldu     $t2,16($tp)
+       lwz     $t4,4($np)      ; load np[j..j+3] in 32-bit word order
+       lwz     $t5,8($np)
+       lwz     $t6,12($np)
+       lwzu    $t7,16($np)
+       extrdi  $t1,$t0,32,0
+       extrdi  $t3,$t2,32,0
+       subfe   $t4,$t4,$t0     ; tp[j]-np[j]
+        stw    $t0,4($ap)      ; save tp[j..j+3] in 32-bit word order
+       subfe   $t5,$t5,$t1     ; tp[j+1]-np[j+1]
+        stw    $t1,8($ap)
+       subfe   $t6,$t6,$t2     ; tp[j+2]-np[j+2]
+        stw    $t2,12($ap)
+       subfe   $t7,$t7,$t3     ; tp[j+3]-np[j+3]
+        stwu   $t3,16($ap)
+       stw     $t4,4($rp)
+       stw     $t5,8($rp)
+       stw     $t6,12($rp)
+       stwu    $t7,16($rp)
+       bdnz-   Lsub
+
+       li      $i,0
+       subfe   $ovf,$i,$ovf    ; handle upmost overflow bit
+       addi    $tp,$sp,`$FRAME+$TRANSFER+4`
+       subf    $rp,$num,$rp    ; rewind rp
+       and     $ap,$tp,$ovf
+       andc    $np,$rp,$ovf
+       or      $ap,$ap,$np     ; ap=borrow?tp:rp
+       addi    $tp,$sp,`$FRAME+$TRANSFER`
+       mtctr   $j
+
+.align 4
+Lcopy:                         ; copy or in-place refresh
+       lwz     $t0,4($ap)
+       lwz     $t1,8($ap)
+       lwz     $t2,12($ap)
+       lwzu    $t3,16($ap)
+       std     $i,8($nap_d)    ; zap nap_d
+       std     $i,16($nap_d)
+       std     $i,24($nap_d)
+       std     $i,32($nap_d)
+       std     $i,40($nap_d)
+       std     $i,48($nap_d)
+       std     $i,56($nap_d)
+       stdu    $i,64($nap_d)
+       stw     $t0,4($rp)
+       stw     $t1,8($rp)
+       stw     $t2,12($rp)
+       stwu    $t3,16($rp)
+       std     $i,8($tp)       ; zap tp at once
+       stdu    $i,16($tp)
+       bdnz-   Lcopy
+___
 \f
-       $POP    r14,`2*$SIZE_T`($sp)
-       $POP    r15,`3*$SIZE_T`($sp)
-       $POP    r16,`4*$SIZE_T`($sp)
-       $POP    r17,`5*$SIZE_T`($sp)
-       $POP    r18,`6*$SIZE_T`($sp)
-       $POP    r19,`7*$SIZE_T`($sp)
-       $POP    r20,`8*$SIZE_T`($sp)
-       $POP    r21,`9*$SIZE_T`($sp)
-       $POP    r22,`10*$SIZE_T`($sp)
-       $POP    r23,`11*$SIZE_T`($sp)
-       lfd     f14,`12*$SIZE_T+0`($sp)
-       lfd     f15,`12*$SIZE_T+8`($sp)
-       lfd     f16,`12*$SIZE_T+16`($sp)
-       lfd     f17,`12*$SIZE_T+24`($sp)
-       lfd     f18,`12*$SIZE_T+32`($sp)
-       lfd     f19,`12*$SIZE_T+40`($sp)
-       lfd     f20,`12*$SIZE_T+48`($sp)
-       lfd     f21,`12*$SIZE_T+56`($sp)
-       lfd     f22,`12*$SIZE_T+64`($sp)
-       lfd     f23,`12*$SIZE_T+72`($sp)
-       lfd     f24,`12*$SIZE_T+80`($sp)
-       lfd     f25,`12*$SIZE_T+88`($sp)
-       $POP    $sp,0($sp)
+$code.=<<___;
+       $POP    $i,0($sp)
        li      r3,1    ; signal "handled"
+       $POP    r22,`-12*8-10*$SIZE_T`($i)
+       $POP    r23,`-12*8-9*$SIZE_T`($i)
+       $POP    r24,`-12*8-8*$SIZE_T`($i)
+       $POP    r25,`-12*8-7*$SIZE_T`($i)
+       $POP    r26,`-12*8-6*$SIZE_T`($i)
+       $POP    r27,`-12*8-5*$SIZE_T`($i)
+       $POP    r28,`-12*8-4*$SIZE_T`($i)
+       $POP    r29,`-12*8-3*$SIZE_T`($i)
+       $POP    r30,`-12*8-2*$SIZE_T`($i)
+       $POP    r31,`-12*8-1*$SIZE_T`($i)
+       lfd     f20,`-12*8`($i)
+       lfd     f21,`-11*8`($i)
+       lfd     f22,`-10*8`($i)
+       lfd     f23,`-9*8`($i)
+       lfd     f24,`-8*8`($i)
+       lfd     f25,`-7*8`($i)
+       lfd     f26,`-6*8`($i)
+       lfd     f27,`-5*8`($i)
+       lfd     f28,`-4*8`($i)
+       lfd     f29,`-3*8`($i)
+       lfd     f30,`-2*8`($i)
+       lfd     f31,`-1*8`($i)
+       mr      $sp,$i
        blr
        .long   0
-.asciz  "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+       .byte   0,12,4,0,0x8c,10,6,0
+       .long   0
+
+.asciz  "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/crypto/ppccap.c b/crypto/ppccap.c
new file mode 100644 (file)
index 0000000..ab89cca
--- /dev/null
@@ -0,0 +1,115 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <crypto.h>
+#include <openssl/bn.h>
+
+#define PPC_FPU64      (1<<0)
+#define PPC_ALTIVEC    (1<<1)
+
+static int OPENSSL_ppccap_P = 0;
+
+static sigset_t all_masked;
+
+#ifdef OPENSSL_BN_ASM_MONT
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num)
+       {
+       int bn_mul_mont_fpu64(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num);
+       int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num);
+
+       if (sizeof(size_t)==4)
+               {
+#if (defined(__APPLE__) && defined(__MACH__))
+               if (num>=8 && (num&3)==0 && (OPENSSL_ppccap_P&PPC_FPU64))
+                       return bn_mul_mont_fpu64(rp,ap,bp,np,n0,num);
+#else
+               /* boundary of 32 was experimentally determined on
+                  Linux 2.6.22, might have to be adjusted on AIX... */
+               if (num>=32 && (num&3)==0 && (OPENSSL_ppccap_P&PPC_FPU64))
+                       {
+                       sigset_t oset;
+                       int ret;
+
+                       sigprocmask(SIG_SETMASK,&all_masked,&oset);
+                       ret=bn_mul_mont_fpu64(rp,ap,bp,np,n0,num);
+                       sigprocmask(SIG_SETMASK,&oset,NULL);
+
+                       return ret;
+                       }
+#endif
+               }
+       else if ((OPENSSL_ppccap_P&PPC_FPU64))
+               /* this is a "must" on POWER6, but run-time detection
+                * is not implemented yet... */
+               return bn_mul_mont_fpu64(rp,ap,bp,np,n0,num);
+
+       return bn_mul_mont_int(rp,ap,bp,np,n0,num);
+       }
+#endif
+
+static sigjmp_buf ill_jmp;
+static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }
+
+void OPENSSL_ppc64_probe(void);
+
+void OPENSSL_cpuid_setup(void)
+       {
+       char *e;
+       struct sigaction        ill_oact,ill_act;
+       sigset_t                oset;
+       static int trigger=0;
+
+       if (trigger) return;
+       trigger=1;
+       sigfillset(&all_masked);
+       sigdelset(&all_masked,SIGILL);
+       sigdelset(&all_masked,SIGTRAP);
+#ifdef SIGEMT
+       sigdelset(&all_masked,SIGEMT);
+#endif
+       sigdelset(&all_masked,SIGFPE);
+       sigdelset(&all_masked,SIGBUS);
+       sigdelset(&all_masked,SIGSEGV);
+
+       if ((e=getenv("OPENSSL_ppccap")))
+               {
+               OPENSSL_ppccap_P=strtoul(e,NULL,0);
+               return;
+               }
+
+       OPENSSL_ppccap_P = 0;
+
+       memset(&ill_act,0,sizeof(ill_act));
+       ill_act.sa_handler = ill_handler;
+       ill_act.sa_mask    = all_masked;
+
+       sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset);
+       sigaction(SIGILL,&ill_act,&ill_oact);
+
+       if (sizeof(size_t)==4)
+               {
+               if (sigsetjmp(ill_jmp,1) == 0)
+                       {
+                       OPENSSL_ppc64_probe();
+                       OPENSSL_ppccap_P |= PPC_FPU64;
+                       }
+               }
+       else
+               {
+               /*
+                * Wanted code detecting POWER6 CPU and setting PPC_FPU64
+                */
+               }
+
+       if (sigsetjmp(ill_jmp,1) == 0)
+               {
+               OPENSSL_altivec_probe();
+               OPENSSL_ppccap_P |= PPC_ALTIVEC;
+               }
+
+       sigaction (SIGILL,&ill_oact,NULL);
+       sigprocmask(SIG_SETMASK,&oset,NULL);
+       }
index 369e1d0..0677469 100755 (executable)
@@ -23,36 +23,67 @@ $code=<<___;
 .machine       "any"
 .text
 
-.globl .OPENSSL_cpuid_setup
+.globl .OPENSSL_ppc64_probe
 .align 4
-.OPENSSL_cpuid_setup:
+.OPENSSL_ppc64_probe:
+       fcfid   f1,f1
+       extrdi  r0,r0,32,0
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
+
+.globl .OPENSSL_altivec_probe
+.align 4
+.OPENSSL_altivec_probe:
+       .long   0x10000484      # vor   v0,v0,v0
+       blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
 
 .globl .OPENSSL_wipe_cpu
 .align 4
 .OPENSSL_wipe_cpu:
        xor     r0,r0,r0
+       fmr     f0,f31
+       fmr     f1,f31
+       fmr     f2,f31
        mr      r3,r1
+       fmr     f3,f31
        xor     r4,r4,r4
+       fmr     f4,f31
        xor     r5,r5,r5
+       fmr     f5,f31
        xor     r6,r6,r6
+       fmr     f6,f31
        xor     r7,r7,r7
+       fmr     f7,f31
        xor     r8,r8,r8
+       fmr     f8,f31
        xor     r9,r9,r9
+       fmr     f9,f31
        xor     r10,r10,r10
+       fmr     f10,f31
        xor     r11,r11,r11
+       fmr     f11,f31
        xor     r12,r12,r12
+       fmr     f12,f31
+       fmr     f13,f31
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
 
 .globl .OPENSSL_atomic_add
 .align 4
 .OPENSSL_atomic_add:
-Loop:  lwarx   r5,0,r3
+Ladd:  lwarx   r5,0,r3
        add     r0,r4,r5
        stwcx.  r0,0,r3
-       bne-    Loop
+       bne-    Ladd
        $SIGNX  r3,r0
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,2,0
+       .long   0
 
 .globl .OPENSSL_rdtsc
 .align 4
@@ -60,6 +91,8 @@ Loop: lwarx   r5,0,r3
        mftb    r3
        mftbu   r4
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
 
 .globl .OPENSSL_cleanse
 .align 4
@@ -89,6 +122,9 @@ Laligned:
        andi.   r4,r4,3
        bne     Little
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,2,0
+       .long   0
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
index dcd0fcd..2140dd2 100755 (executable)
@@ -24,12 +24,14 @@ $flavour = shift;
 
 if ($flavour =~ /64/) {
        $SIZE_T =8;
+       $LRSAVE =2*$SIZE_T;
        $UCMP   ="cmpld";
        $STU    ="stdu";
        $POP    ="ld";
        $PUSH   ="std";
 } elsif ($flavour =~ /32/) {
        $SIZE_T =4;
+       $LRSAVE =$SIZE_T;
        $UCMP   ="cmplw";
        $STU    ="stwu";
        $POP    ="lwz";
@@ -43,7 +45,8 @@ die "can't locate ppc-xlate.pl";
 
 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
 
-$FRAME=24*$SIZE_T;
+$FRAME=24*$SIZE_T+64;
+$LOCALS=6*$SIZE_T;
 
 $K  ="r0";
 $sp ="r1";
@@ -162,9 +165,8 @@ $code=<<___;
 .globl .sha1_block_data_order
 .align 4
 .sha1_block_data_order:
+       $STU    $sp,-$FRAME($sp)
        mflr    r0
-       $STU    $sp,`-($FRAME+64)`($sp)
-       $PUSH   r0,`$FRAME-$SIZE_T*18`($sp)
        $PUSH   r15,`$FRAME-$SIZE_T*17`($sp)
        $PUSH   r16,`$FRAME-$SIZE_T*16`($sp)
        $PUSH   r17,`$FRAME-$SIZE_T*15`($sp)
@@ -182,6 +184,7 @@ $code=<<___;
        $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
        $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
        $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
+       $PUSH   r0,`$FRAME+$LRSAVE`($sp)
        lwz     $A,0($ctx)
        lwz     $B,4($ctx)
        lwz     $C,8($ctx)
@@ -192,37 +195,14 @@ $code=<<___;
 Laligned:
        mtctr   $num
        bl      Lsha1_block_private
-Ldone:
-       $POP    r0,`$FRAME-$SIZE_T*18`($sp)
-       $POP    r15,`$FRAME-$SIZE_T*17`($sp)
-       $POP    r16,`$FRAME-$SIZE_T*16`($sp)
-       $POP    r17,`$FRAME-$SIZE_T*15`($sp)
-       $POP    r18,`$FRAME-$SIZE_T*14`($sp)
-       $POP    r19,`$FRAME-$SIZE_T*13`($sp)
-       $POP    r20,`$FRAME-$SIZE_T*12`($sp)
-       $POP    r21,`$FRAME-$SIZE_T*11`($sp)
-       $POP    r22,`$FRAME-$SIZE_T*10`($sp)
-       $POP    r23,`$FRAME-$SIZE_T*9`($sp)
-       $POP    r24,`$FRAME-$SIZE_T*8`($sp)
-       $POP    r25,`$FRAME-$SIZE_T*7`($sp)
-       $POP    r26,`$FRAME-$SIZE_T*6`($sp)
-       $POP    r27,`$FRAME-$SIZE_T*5`($sp)
-       $POP    r28,`$FRAME-$SIZE_T*4`($sp)
-       $POP    r29,`$FRAME-$SIZE_T*3`($sp)
-       $POP    r30,`$FRAME-$SIZE_T*2`($sp)
-       $POP    r31,`$FRAME-$SIZE_T*1`($sp)
-       mtlr    r0
-       addi    $sp,$sp,`$FRAME+64`
-       blr
-___
+       b       Ldone
 
-# PowerPC specification allows an implementation to be ill-behaved
-# upon unaligned access which crosses page boundary. "Better safe
-# than sorry" principle makes me treat it specially. But I don't
-# look for particular offending word, but rather for 64-byte input
-# block which crosses the boundary. Once found that block is aligned
-# and hashed separately...
-$code.=<<___;
+; PowerPC specification allows an implementation to be ill-behaved
+; upon unaligned access which crosses page boundary. "Better safe
+; than sorry" principle makes me treat it specially. But I don't
+; look for particular offending word, but rather for 64-byte input
+; block which crosses the boundary. Once found that block is aligned
+; and hashed separately...
 .align 4
 Lunaligned:
        subfic  $t1,$inp,4096
@@ -237,7 +217,7 @@ Lunaligned:
 Lcross_page:
        li      $t1,16
        mtctr   $t1
-       addi    r20,$sp,$FRAME  ; spot below the frame
+       addi    r20,$sp,$LOCALS ; spot within the frame
 Lmemcpy:
        lbz     r16,0($inp)
        lbz     r17,1($inp)
@@ -251,15 +231,40 @@ Lmemcpy:
        addi    r20,r20,4
        bdnz    Lmemcpy
 
-       $PUSH   $inp,`$FRAME-$SIZE_T*19`($sp)
+       $PUSH   $inp,`$FRAME-$SIZE_T*18`($sp)
        li      $t1,1
-       addi    $inp,$sp,$FRAME
+       addi    $inp,$sp,$LOCALS
        mtctr   $t1
        bl      Lsha1_block_private
-       $POP    $inp,`$FRAME-$SIZE_T*19`($sp)
+       $POP    $inp,`$FRAME-$SIZE_T*18`($sp)
        addic.  $num,$num,-1
        bne-    Lunaligned
-       b       Ldone
+
+Ldone:
+       $POP    r0,`$FRAME+$LRSAVE`($sp)
+       $POP    r15,`$FRAME-$SIZE_T*17`($sp)
+       $POP    r16,`$FRAME-$SIZE_T*16`($sp)
+       $POP    r17,`$FRAME-$SIZE_T*15`($sp)
+       $POP    r18,`$FRAME-$SIZE_T*14`($sp)
+       $POP    r19,`$FRAME-$SIZE_T*13`($sp)
+       $POP    r20,`$FRAME-$SIZE_T*12`($sp)
+       $POP    r21,`$FRAME-$SIZE_T*11`($sp)
+       $POP    r22,`$FRAME-$SIZE_T*10`($sp)
+       $POP    r23,`$FRAME-$SIZE_T*9`($sp)
+       $POP    r24,`$FRAME-$SIZE_T*8`($sp)
+       $POP    r25,`$FRAME-$SIZE_T*7`($sp)
+       $POP    r26,`$FRAME-$SIZE_T*6`($sp)
+       $POP    r27,`$FRAME-$SIZE_T*5`($sp)
+       $POP    r28,`$FRAME-$SIZE_T*4`($sp)
+       $POP    r29,`$FRAME-$SIZE_T*3`($sp)
+       $POP    r30,`$FRAME-$SIZE_T*2`($sp)
+       $POP    r31,`$FRAME-$SIZE_T*1`($sp)
+       mtlr    r0
+       addi    $sp,$sp,$FRAME
+       blr
+       .long   0
+       .byte   0,12,4,1,0x80,18,3,0
+       .long   0
 ___
 
 # This is private block function, which uses tailored calling
@@ -309,6 +314,8 @@ $code.=<<___;
        addi    $inp,$inp,`16*4`
        bdnz-   Lsha1_block_private
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
 ___
 $code.=<<___;
 .asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
index 768a6a6..6b44a68 100755 (executable)
@@ -40,6 +40,7 @@ $output =shift;
 
 if ($flavour =~ /64/) {
        $SIZE_T=8;
+       $LRSAVE=2*$SIZE_T;
        $STU="stdu";
        $UCMP="cmpld";
        $SHL="sldi";
@@ -47,6 +48,7 @@ if ($flavour =~ /64/) {
        $PUSH="std";
 } elsif ($flavour =~ /32/) {
        $SIZE_T=4;
+       $LRSAVE=$SIZE_T;
        $STU="stwu";
        $UCMP="cmplw";
        $SHL="slwi";
@@ -87,7 +89,8 @@ if ($output =~ /512/) {
        $SHR="srwi";
 }
 
-$FRAME=32*$SIZE_T;
+$FRAME=32*$SIZE_T+16*$SZ;
+$LOCALS=6*$SIZE_T;
 
 $sp ="r1";
 $toc="r2";
@@ -179,13 +182,12 @@ $code=<<___;
 .globl $func
 .align 6
 $func:
+       $STU    $sp,-$FRAME($sp)
        mflr    r0
-       $STU    $sp,`-($FRAME+16*$SZ)`($sp)
        $SHL    $num,$num,`log(16*$SZ)/log(2)`
 
        $PUSH   $ctx,`$FRAME-$SIZE_T*22`($sp)
 
-       $PUSH   r0,`$FRAME-$SIZE_T*21`($sp)
        $PUSH   $toc,`$FRAME-$SIZE_T*20`($sp)
        $PUSH   r13,`$FRAME-$SIZE_T*19`($sp)
        $PUSH   r14,`$FRAME-$SIZE_T*18`($sp)
@@ -206,6 +208,7 @@ $func:
        $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
        $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
        $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
+       $PUSH   r0,`$FRAME+$LRSAVE`($sp)
 
        $LD     $A,`0*$SZ`($ctx)
        mr      $inp,r4                         ; incarnate $inp
@@ -217,7 +220,7 @@ $func:
        $LD     $G,`6*$SZ`($ctx)
        $LD     $H,`7*$SZ`($ctx)
 
-       b       LPICmeup
+       bl      LPICmeup
 LPICedup:
        andi.   r0,$inp,3
        bne     Lunaligned
@@ -226,40 +229,14 @@ Laligned:
        $PUSH   $num,`$FRAME-$SIZE_T*24`($sp)   ; end pointer
        $PUSH   $inp,`$FRAME-$SIZE_T*23`($sp)   ; inp pointer
        bl      Lsha2_block_private
-Ldone:
-       $POP    r0,`$FRAME-$SIZE_T*21`($sp)
-       $POP    $toc,`$FRAME-$SIZE_T*20`($sp)
-       $POP    r13,`$FRAME-$SIZE_T*19`($sp)
-       $POP    r14,`$FRAME-$SIZE_T*18`($sp)
-       $POP    r15,`$FRAME-$SIZE_T*17`($sp)
-       $POP    r16,`$FRAME-$SIZE_T*16`($sp)
-       $POP    r17,`$FRAME-$SIZE_T*15`($sp)
-       $POP    r18,`$FRAME-$SIZE_T*14`($sp)
-       $POP    r19,`$FRAME-$SIZE_T*13`($sp)
-       $POP    r20,`$FRAME-$SIZE_T*12`($sp)
-       $POP    r21,`$FRAME-$SIZE_T*11`($sp)
-       $POP    r22,`$FRAME-$SIZE_T*10`($sp)
-       $POP    r23,`$FRAME-$SIZE_T*9`($sp)
-       $POP    r24,`$FRAME-$SIZE_T*8`($sp)
-       $POP    r25,`$FRAME-$SIZE_T*7`($sp)
-       $POP    r26,`$FRAME-$SIZE_T*6`($sp)
-       $POP    r27,`$FRAME-$SIZE_T*5`($sp)
-       $POP    r28,`$FRAME-$SIZE_T*4`($sp)
-       $POP    r29,`$FRAME-$SIZE_T*3`($sp)
-       $POP    r30,`$FRAME-$SIZE_T*2`($sp)
-       $POP    r31,`$FRAME-$SIZE_T*1`($sp)
-       mtlr    r0
-       addi    $sp,$sp,`$FRAME+16*$SZ`
-       blr
-___
+       b       Ldone
 
-# PowerPC specification allows an implementation to be ill-behaved
-# upon unaligned access which crosses page boundary. "Better safe
-# than sorry" principle makes me treat it specially. But I don't
-# look for particular offending word, but rather for the input
-# block which crosses the boundary. Once found that block is aligned
-# and hashed separately...
-$code.=<<___;
+; PowerPC specification allows an implementation to be ill-behaved
+; upon unaligned access which crosses page boundary. "Better safe
+; than sorry" principle makes me treat it specially. But I don't
+; look for particular offending word, but rather for the input
+; block which crosses the boundary. Once found that block is aligned
+; and hashed separately...
 .align 4
 Lunaligned:
        subfic  $t1,$inp,4096
@@ -278,7 +255,7 @@ Lunaligned:
 Lcross_page:
        li      $t1,`16*$SZ/4`
        mtctr   $t1
-       addi    r20,$sp,$FRAME                  ; aligned spot below the frame
+       addi    r20,$sp,$LOCALS                 ; aligned spot below the frame
 Lmemcpy:
        lbz     r16,0($inp)
        lbz     r17,1($inp)
@@ -293,8 +270,8 @@ Lmemcpy:
        bdnz    Lmemcpy
 
        $PUSH   $inp,`$FRAME-$SIZE_T*26`($sp)   ; save real inp
-       addi    $t1,$sp,`$FRAME+16*$SZ`         ; fictitious end pointer
-       addi    $inp,$sp,$FRAME                 ; fictitious inp pointer
+       addi    $t1,$sp,`$LOCALS+16*$SZ`        ; fictitious end pointer
+       addi    $inp,$sp,$LOCALS                ; fictitious inp pointer
        $PUSH   $num,`$FRAME-$SIZE_T*25`($sp)   ; save real num
        $PUSH   $t1,`$FRAME-$SIZE_T*24`($sp)    ; end pointer
        $PUSH   $inp,`$FRAME-$SIZE_T*23`($sp)   ; inp pointer
@@ -303,10 +280,36 @@ Lmemcpy:
        $POP    $num,`$FRAME-$SIZE_T*25`($sp)   ; restore real num
        addic.  $num,$num,`-16*$SZ`             ; num--
        bne-    Lunaligned
-       b       Ldone
-___
 
-$code.=<<___;
+Ldone:
+       $POP    r0,`$FRAME+$LRSAVE`($sp)
+       $POP    $toc,`$FRAME-$SIZE_T*20`($sp)
+       $POP    r13,`$FRAME-$SIZE_T*19`($sp)
+       $POP    r14,`$FRAME-$SIZE_T*18`($sp)
+       $POP    r15,`$FRAME-$SIZE_T*17`($sp)
+       $POP    r16,`$FRAME-$SIZE_T*16`($sp)
+       $POP    r17,`$FRAME-$SIZE_T*15`($sp)
+       $POP    r18,`$FRAME-$SIZE_T*14`($sp)
+       $POP    r19,`$FRAME-$SIZE_T*13`($sp)
+       $POP    r20,`$FRAME-$SIZE_T*12`($sp)
+       $POP    r21,`$FRAME-$SIZE_T*11`($sp)
+       $POP    r22,`$FRAME-$SIZE_T*10`($sp)
+       $POP    r23,`$FRAME-$SIZE_T*9`($sp)
+       $POP    r24,`$FRAME-$SIZE_T*8`($sp)
+       $POP    r25,`$FRAME-$SIZE_T*7`($sp)
+       $POP    r26,`$FRAME-$SIZE_T*6`($sp)
+       $POP    r27,`$FRAME-$SIZE_T*5`($sp)
+       $POP    r28,`$FRAME-$SIZE_T*4`($sp)
+       $POP    r29,`$FRAME-$SIZE_T*3`($sp)
+       $POP    r30,`$FRAME-$SIZE_T*2`($sp)
+       $POP    r31,`$FRAME-$SIZE_T*1`($sp)
+       mtlr    r0
+       addi    $sp,$sp,$FRAME
+       blr
+       .long   0
+       .byte   0,12,4,1,0x80,18,3,0
+       .long   0
+
 .align 4
 Lsha2_block_private:
 ___
@@ -372,6 +375,8 @@ $code.=<<___;
        $ST     $H,`7*$SZ`($ctx)
        bne     Lsha2_block_private
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
 ___
 
 # Ugly hack here, because PPC assembler syntax seem to vary too
@@ -379,22 +384,15 @@ ___
 $code.=<<___;
 .align 6
 LPICmeup:
-       bl      LPIC
-       addi    $Tbl,$Tbl,`64-4`        ; "distance" between . and last nop
-       b       LPICedup
-       nop
-       nop
-       nop
-       nop
-       nop
-LPIC:  mflr    $Tbl
+       mflr    r0
+       bcl     20,31,\$+4
+       mflr    $Tbl    ; vvvvvv "distance" between . and 1st data entry
+       addi    $Tbl,$Tbl,`64-8`
+       mtlr    r0
        blr
-       nop
-       nop
-       nop
-       nop
-       nop
-       nop
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
+       .space  `64-9*4`
 ___
 $code.=<<___ if ($SZ==8);
        .long   0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd