Add sha512-ppc.pl module.
author Andy Polyakov <appro@openssl.org>
Mon, 5 Jun 2006 09:37:55 +0000 (09:37 +0000)
committer Andy Polyakov <appro@openssl.org>
Mon, 5 Jun 2006 09:37:55 +0000 (09:37 +0000)
Configure
TABLE
crypto/sha/Makefile
crypto/sha/asm/sha512-ppc.pl [new file with mode: 0755]

diff --git a/Configure b/Configure
index 252e7dbe2744aab5bfe48ddc34f0038f3a729fa1..4938880b5c5793d0217b21c2b7b76b23da0cc4d4 100755 (executable)
--- a/Configure
+++ b/Configure
@@ -315,7 +315,7 @@ my %table=(
 # *-generic* is endian-neutral target, but ./config is free to
 # throw in -D[BL]_ENDIAN, whichever appropriate...
 "linux-generic32","gcc:-DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
-"linux-ppc",   "gcc:-DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL::linux_ppc32.o linux_ppc32-mont.o:::::sha1-ppc_linux32.o::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"linux-ppc",   "gcc:-DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL::linux_ppc32.o linux_ppc32-mont.o:::::sha1-ppc_linux32.o sha256-ppc_linux32.o::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 #### IA-32 targets...
 "linux-ia32-icc",      "icc:-DL_ENDIAN -DTERMIO -O2 -no_cpprt::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-KPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-elf",   "gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
@@ -323,7 +323,7 @@ my %table=(
 ####
 "linux-generic64","gcc:-DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 # -bpowerpc64-linux is transient option, -m64 should be the one to use...
-"linux-ppc64", "gcc:-bpowerpc64-linux -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL::linux_ppc64.o linux_ppc64-mont.o:::::sha1-ppc_linux64.o::::::dlfcn:linux-shared:-fPIC:-bpowerpc64-linux:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"linux-ppc64", "gcc:-bpowerpc64-linux -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL::linux_ppc64.o linux_ppc64-mont.o:::::sha1-ppc_linux64.o sha256-ppc_linux64.o sha512-ppc_linux64.o::::::dlfcn:linux-shared:-fPIC:-bpowerpc64-linux:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-ia64",  "gcc:-DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_UNROLL DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-ia64-icc","icc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
@@ -408,12 +408,12 @@ my %table=(
 
 #### IBM's AIX.
 "aix3-cc",  "cc:-O -DB_ENDIAN -qmaxmem=16384::(unknown):AIX::BN_LLONG RC4_CHAR:::",
-"aix-gcc",  "gcc:-O -DB_ENDIAN::-D_THREAD_SAFE:AIX::BN_LLONG RC4_CHAR::aix_ppc32.o aix_ppc32-mont.o:::::sha1-ppc_aix32.o::::::dlfcn:",
-"aix64-gcc","gcc:-O -DB_ENDIAN::-D_THREAD_SAFE:AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR::aix_ppc64.o aix_ppc64-mont.o:::::sha1-ppc_aix64.o::::::dlfcn::::::-X64",
+"aix-gcc",  "gcc:-O -DB_ENDIAN::-D_THREAD_SAFE:AIX::BN_LLONG RC4_CHAR::aix_ppc32.o aix_ppc32-mont.o:::::sha1-ppc_aix32.o sha256-ppc_aix32.o::::::dlfcn:",
+"aix64-gcc","gcc:-O -DB_ENDIAN::-D_THREAD_SAFE:AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR::aix_ppc64.o aix_ppc64-mont.o:::::sha1-ppc_aix64.o sha256-ppc_aix64.o sha512-ppc_aix64.o::::::dlfcn::::::-X64",
 # Below targets assume AIX 5. Idea is to effectively disregard $OBJECT_MODE
 # at build time. $OBJECT_MODE is respected at ./config stage!
-"aix-cc",   "cc:-q32 -O -DB_ENDIAN -qmaxmem=16384::-qthreaded:AIX::BN_LLONG RC4_CHAR::aix_ppc32.o aix_ppc32-mont.o:::::sha1-ppc_aix32.o::::::dlfcn:aix-shared::-q32:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 32",
-"aix64-cc", "cc:-q64 -O -DB_ENDIAN -qmaxmem=16384::(unknown):AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR::aix_ppc64.o aix_ppc64-mont.o:::::sha1-ppc_aix64.o::::::dlfcn:aix-shared::-q64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 64",
+"aix-cc",   "cc:-q32 -O -DB_ENDIAN -qmaxmem=16384::-qthreaded:AIX::BN_LLONG RC4_CHAR::aix_ppc32.o aix_ppc32-mont.o:::::sha1-ppc_aix32.o sha256-ppc_aix32.o::::::dlfcn:aix-shared::-q32:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 32",
+"aix64-cc", "cc:-q64 -O -DB_ENDIAN -qmaxmem=16384::(unknown):AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR::aix_ppc64.o aix_ppc64-mont.o:::::sha1-ppc_aix64.o sha256-ppc_aix64.o sha512-ppc_aix64.o::::::dlfcn:aix-shared::-q64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 64",
 
 #
 # Cray T90 and similar (SDSC)
@@ -505,8 +505,8 @@ my %table=(
 
 ##### MacOS X (a.k.a. Rhapsody or Darwin) setup
 "rhapsody-ppc-cc","cc:-O3 -DB_ENDIAN::(unknown):MACOSX_RHAPSODY::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${no_asm}::",
-"darwin-ppc-cc","cc:-O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc32.o osx_ppc32-mont.o:::::sha1-ppc_osx32.o::::::dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
-"darwin64-ppc-cc","cc:-m64 -O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc64.o osx_ppc64-mont.o:::::sha1-ppc_osx64.o::::::dlfcn:darwin-shared:-fPIC -fno-common:-m64 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
+"darwin-ppc-cc","cc:-O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc32.o osx_ppc32-mont.o:::::sha1-ppc_osx32.o sha256-ppc_osx32.o::::::dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
+"darwin64-ppc-cc","cc:-m64 -O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc64.o osx_ppc64-mont.o:::::sha1-ppc_osx64.o sha256-ppc_osx64.o sha512-ppc_osx64.o::::::dlfcn:darwin-shared:-fPIC -fno-common:-m64 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
 "darwin-i386-cc","cc:-O3 -fomit-frame-pointer -DL_ENDIAN::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${no_asm}:dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
 "debug-darwin-ppc-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -DB_ENDIAN -g -Wall -O::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc32.o osx_ppc32-mont.o:::::sha1-ppc_osx32.o::::::dlfcn:darwin-shared:-fPIC:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
 
diff --git a/TABLE b/TABLE
index 2509d1838ee53e9048b6ce6f78b6421f1bbab1d5..6b67d9295c6d2b20070e5dfc3fc42d54b6942a3e 100644 (file)
--- a/TABLE
+++ b/TABLE
@@ -714,7 +714,7 @@ $des_obj      =
 $aes_obj      = 
 $bf_obj       = 
 $md5_obj      = 
-$sha1_obj     = sha1-ppc_aix32.o
+$sha1_obj     = sha1-ppc_aix32.o sha256-ppc_aix32.o
 $cast_obj     = 
 $rc4_obj      = 
 $rmd160_obj   = 
@@ -742,7 +742,7 @@ $des_obj      =
 $aes_obj      = 
 $bf_obj       = 
 $md5_obj      = 
-$sha1_obj     = sha1-ppc_aix32.o
+$sha1_obj     = sha1-ppc_aix32.o sha256-ppc_aix32.o
 $cast_obj     = 
 $rc4_obj      = 
 $rmd160_obj   = 
@@ -798,7 +798,7 @@ $des_obj      =
 $aes_obj      = 
 $bf_obj       = 
 $md5_obj      = 
-$sha1_obj     = sha1-ppc_aix64.o
+$sha1_obj     = sha1-ppc_aix64.o sha256-ppc_aix64.o sha512-ppc_aix64.o
 $cast_obj     = 
 $rc4_obj      = 
 $rmd160_obj   = 
@@ -826,7 +826,7 @@ $des_obj      =
 $aes_obj      = 
 $bf_obj       = 
 $md5_obj      = 
-$sha1_obj     = sha1-ppc_aix64.o
+$sha1_obj     = sha1-ppc_aix64.o sha256-ppc_aix64.o sha512-ppc_aix64.o
 $cast_obj     = 
 $rc4_obj      = 
 $rmd160_obj   = 
@@ -1078,7 +1078,7 @@ $des_obj      =
 $aes_obj      = 
 $bf_obj       = 
 $md5_obj      = 
-$sha1_obj     = sha1-ppc_osx32.o
+$sha1_obj     = sha1-ppc_osx32.o sha256-ppc_osx32.o
 $cast_obj     = 
 $rc4_obj      = 
 $rmd160_obj   = 
@@ -1106,7 +1106,7 @@ $des_obj      =
 $aes_obj      = 
 $bf_obj       = 
 $md5_obj      = 
-$sha1_obj     = sha1-ppc_osx64.o
+$sha1_obj     = sha1-ppc_osx64.o sha256-ppc_osx64.o sha512-ppc_osx64.o
 $cast_obj     = 
 $rc4_obj      = 
 $rmd160_obj   = 
@@ -3010,7 +3010,7 @@ $des_obj      =
 $aes_obj      = 
 $bf_obj       = 
 $md5_obj      = 
-$sha1_obj     = sha1-ppc_linux32.o
+$sha1_obj     = sha1-ppc_linux32.o sha256-ppc_linux32.o
 $cast_obj     = 
 $rc4_obj      = 
 $rmd160_obj   = 
@@ -3038,7 +3038,7 @@ $des_obj      =
 $aes_obj      = 
 $bf_obj       = 
 $md5_obj      = 
-$sha1_obj     = sha1-ppc_linux64.o
+$sha1_obj     = sha1-ppc_linux64.o sha256-ppc_linux64.o sha512-ppc_linux64.o
 $cast_obj     = 
 $rc4_obj      = 
 $rmd160_obj   = 
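diff --git a/crypto/sha/Makefile b/crypto/sha/Makefile
index 01d8b2e469ad3086b36516a67c01e193cef9c6bd..21259e8aeef53b2fdda04866637bf5ab5463bd30 100644 (file)
--- a/crypto/sha/Makefile
+++ b/crypto/sha/Makefile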
@@ -73,8 +73,14 @@ sha512-x86_64.s: asm/sha512-x86_64.pl
 
 sha1-ppc_aix32.s: asm/sha1-ppc.pl;     $(PERL) asm/sha1-ppc.pl $@
 sha1-ppc_aix64.s: asm/sha1-ppc.pl;     $(PERL) asm/sha1-ppc.pl $@
+sha256-ppc_aix32.s: asm/sha512-ppc.pl; $(PERL) asm/sha512-ppc.pl $@
+sha256-ppc_aix64.s: asm/sha512-ppc.pl; $(PERL) asm/sha512-ppc.pl $@
+sha512-ppc_aix32.s: asm/sha512-ppc.pl; $(PERL) asm/sha512-ppc.pl $@
+sha512-ppc_aix64.s: asm/sha512-ppc.pl; $(PERL) asm/sha512-ppc.pl $@
 # non-AIX targets are believed to be armed with GNU make
-sha1-ppc_%.s: asm/sha1-ppc.pl;         $(PERL) $< $@
+sha1-ppc_%.s:  asm/sha1-ppc.pl;        $(PERL) $< $@
+sha256-ppc_%.s:        asm/sha512-ppc.pl;      $(PERL) $< $@
+sha512-ppc_%.s:        asm/sha512-ppc.pl;      $(PERL) $< $@
 
 files:
        $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
diff --git a/crypto/sha/asm/sha512-ppc.pl b/crypto/sha/asm/sha512-ppc.pl
new file mode 100755 (executable)
index 0000000..b866f93
--- /dev/null
+++ b/crypto/sha/asm/sha512-ppc.pl
@@ -0,0 +1,431 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. Rights for redistribution and usage in source and binary
+# forms are granted according to the OpenSSL license.
+# ====================================================================
+
+# I let hardware handle unaligned input, except on page boundaries
+# (see below for details). Otherwise straightforward implementation
+# with X vector in register bank. The module is big-endian [which is
+# not a big deal, as there are no little-endian targets left around].
+
+#                      sha256          |       sha512
+#                      -m64    -m32    |       -m64    -m32
+# --------------------------------------+-----------------------
+# PPC970,gcc-4.0.0     +50%    +38%    |       +40%    +410%(*)
+#
+# (*)  64-bit code in 32-bit application context, which is actually
+#      on the TODO list
+
+$output=shift;  # first command-line argument: name of the assembly file to generate
+
+if ($output =~ /64/) {  # name contains 64: pointer-width operations use doubleword mnemonics
+       $SIZE_T=8;  # bytes per pointer-width stack slot
+       $STU="stdu";  # store with update; allocates the stack frame in the prologue
+       $UCMP="cmpld";  # pointer-width logical compare
+       $SHL="sldi";  # shift left immediate; scales $num in the prologue
+       $POP="ld";  # pointer-width load, used for stack slots
+       $PUSH="std";  # pointer-width store, used for stack slots
+} elsif ($output =~ /32/) {  # name contains 32 (and no 64): same roles, word mnemonics
+       $SIZE_T=4;
+       $STU="stwu";
+       $UCMP="cmplw";
+       $SHL="slwi";
+       $POP="lwz";
+       $PUSH="stw";
+} else { die "nonsense $output"; }  # name carries neither 64 nor 32
+
+( defined shift || open STDOUT,"| $^X ../perlasm/ppc-xlate.pl $output" ) ||  # no second argument: pipe STDOUT through ppc-xlate.pl; otherwise STDOUT is left alone
+       die "can't call ../perlasm/ppc-xlate.pl: $!";
+
+if ($output =~ /512/) {  # name contains 512: generate SHA-512, otherwise SHA-256
+       $func="sha512_block";  # exported symbol name
+       $SZ=8;  # bytes per hash word
+       @Sigma0=(28,34,39);  # rotate counts applied to a in each round
+       @Sigma1=(14,18,41);  # rotate counts applied to e in each round
+       @sigma0=(1,  8, 7);  # message schedule: two rotate counts, then a shift count
+       @sigma1=(19,61, 6);  # message schedule: two rotate counts, then a shift count
+       $rounds=80;  # total number of rounds per block
+       $LD="ld";  # hash-word load
+       $ST="std";  # hash-word store
+       $ROR="rotrdi";  # hash-word rotate right immediate
+       $SHR="srdi";  # hash-word shift right immediate
+} else {
+       $func="sha256_block";  # exported symbol name
+       $SZ=4;  # bytes per hash word
+       @Sigma0=( 2,13,22);  # rotate counts applied to a in each round
+       @Sigma1=( 6,11,25);  # rotate counts applied to e in each round
+       @sigma0=( 7,18, 3);  # message schedule: two rotate counts, then a shift count
+       @sigma1=(17,19,10);  # message schedule: two rotate counts, then a shift count
+       $rounds=64;  # total number of rounds per block
+       $LD="lwz";  # hash-word load
+       $ST="stw";  # hash-word store
+       $ROR="rotrwi";  # hash-word rotate right immediate
+       $SHR="srwi";  # hash-word shift right immediate
+}
+
+$FRAME=32*$SIZE_T;  # save area size; the prologue allocates $FRAME+16*$SZ so a one-block buffer sits at $sp+$FRAME
+
+$sp ="r1";  # stack pointer
+$toc="r2";     # zapped by $Tbl
+$ctx="r3";     # zapped by $a0
+$inp="r4";  # input pointer, advanced by 16*$SZ per processed block
+$num="r5";     # zapped by $a1
+
+$T  ="r0";  # per-round accumulator
+$Tbl="r2";  # pointer into the round-constant table
+$a0 ="r3";  # scratch
+$a1 ="r5";  # scratch
+$t0 ="r6";  # scratch
+$t1 ="r7";  # scratch
+
+$A  ="r8";  # $A..$H: the eight working variables, loaded from $ctx in the prologue
+$B  ="r9";
+$C  ="r10";
+$D  ="r11";
+$E  ="r12";
+$F  ="r13";  # NOTE(review): r13 is reserved by some PowerPC ABIs; it is saved and restored by the prologue and epilogue - confirm this is acceptable on every target
+$G  ="r14";
+$H  ="r15";
+
+@V=($A,$B,$C,$D,$E,$F,$G,$H);  # register names in role order; rotated by one after every generated round
+@X=("r16","r17","r18","r19","r20","r21","r22","r23",  # sixteen message-schedule words kept in registers
+    "r24","r25","r26","r27","r28","r29","r30","r31");
+
+sub ROUND_00_15 {  # append the assembly for one round to $code; returns nothing useful
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;  # $i: index into @X and the constant table; $a..$h: register names currently playing roles a..h
+$code.=<<___;  # emitted round: T = K[i]+h+X[i]+Ch(e,f,g)+Sigma1(e); d += T; h = T+Sigma0(a)+Maj(a,b,c)
+       $LD     $T,`$i*$SZ`($Tbl)
+       $ROR    $a0,$e,$Sigma1[0]
+       $ROR    $a1,$e,$Sigma1[1]
+       and     $t0,$f,$e
+       andc    $t1,$g,$e
+       add     $T,$T,$h
+       xor     $a0,$a0,$a1
+       $ROR    $a1,$a1,`$Sigma1[2]-$Sigma1[1]`
+       or      $t0,$t0,$t1             ; Ch(e,f,g)
+       add     $T,$T,@X[$i]
+       xor     $a0,$a0,$a1             ; Sigma1(e)
+       add     $T,$T,$t0
+       add     $T,$T,$a0
+
+       $ROR    $a0,$a,$Sigma0[0]
+       $ROR    $a1,$a,$Sigma0[1]
+       and     $t0,$a,$b
+       and     $t1,$a,$c
+       xor     $a0,$a0,$a1
+       $ROR    $a1,$a1,`$Sigma0[2]-$Sigma0[1]`
+       xor     $t0,$t0,$t1
+       and     $t1,$b,$c
+       xor     $a0,$a0,$a1             ; Sigma0(a)
+       add     $d,$d,$T
+       xor     $t0,$t0,$t1             ; Maj(a,b,c)
+       add     $h,$T,$a0
+       add     $h,$h,$t0
+
+___
+}
+
+sub ROUND_16_xx {  # append a message-schedule update followed by a regular round to $code
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;  # same argument layout as ROUND_00_15
+$i-=16;  # the generator loop passes 16..31; reduce to a slot index 0..15 in @X
+$code.=<<___;  # emitted update: X[i] += sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9], indices taken mod 16
+       $ROR    $a0,@X[($i+1)%16],$sigma0[0]
+       $ROR    $a1,@X[($i+1)%16],$sigma0[1]
+       $ROR    $t0,@X[($i+14)%16],$sigma1[0]
+       $ROR    $t1,@X[($i+14)%16],$sigma1[1]
+       xor     $a0,$a0,$a1
+       $SHR    $a1,@X[($i+1)%16],$sigma0[2]
+       xor     $t0,$t0,$t1
+       $SHR    $t1,@X[($i+14)%16],$sigma1[2]
+       add     @X[$i],@X[$i],@X[($i+9)%16]
+       xor     $a0,$a0,$a1             ; sigma0(X[(i+1)&0x0f])
+       xor     $t0,$t0,$t1             ; sigma1(X[(i+14)&0x0f])
+       add     @X[$i],@X[$i],$a0
+       add     @X[$i],@X[$i],$t0
+___
+&ROUND_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h);  # uses the freshly updated @X[$i]; the constant offset is relative to the already advanced $Tbl
+}
+
+$code=<<___;
+.text
+
+.globl $func
+.align 6
+$func:
+       mflr    r0
+       $STU    $sp,`-($FRAME+16*$SZ)`($sp)
+       $SHL    $num,$num,`log(16*$SZ)/log(2)`
+
+       $PUSH   $ctx,`$FRAME-$SIZE_T*22`($sp)
+
+       $PUSH   r0,`$FRAME-$SIZE_T*21`($sp)
+       $PUSH   $toc,`$FRAME-$SIZE_T*20`($sp)
+       $PUSH   r13,`$FRAME-$SIZE_T*19`($sp)
+       $PUSH   r14,`$FRAME-$SIZE_T*18`($sp)
+       $PUSH   r15,`$FRAME-$SIZE_T*17`($sp)
+       $PUSH   r16,`$FRAME-$SIZE_T*16`($sp)
+       $PUSH   r17,`$FRAME-$SIZE_T*15`($sp)
+       $PUSH   r18,`$FRAME-$SIZE_T*14`($sp)
+       $PUSH   r19,`$FRAME-$SIZE_T*13`($sp)
+       $PUSH   r20,`$FRAME-$SIZE_T*12`($sp)
+       $PUSH   r21,`$FRAME-$SIZE_T*11`($sp)
+       $PUSH   r22,`$FRAME-$SIZE_T*10`($sp)
+       $PUSH   r23,`$FRAME-$SIZE_T*9`($sp)
+       $PUSH   r24,`$FRAME-$SIZE_T*8`($sp)
+       $PUSH   r25,`$FRAME-$SIZE_T*7`($sp)
+       $PUSH   r26,`$FRAME-$SIZE_T*6`($sp)
+       $PUSH   r27,`$FRAME-$SIZE_T*5`($sp)
+       $PUSH   r28,`$FRAME-$SIZE_T*4`($sp)
+       $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
+       $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
+       $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
+
+       $LD     $A,`0*$SZ`($ctx)
+       $LD     $B,`1*$SZ`($ctx)
+       $LD     $C,`2*$SZ`($ctx)
+       $LD     $D,`3*$SZ`($ctx)
+       $LD     $E,`4*$SZ`($ctx)
+       $LD     $F,`5*$SZ`($ctx)
+       $LD     $G,`6*$SZ`($ctx)
+       $LD     $H,`7*$SZ`($ctx)
+
+       b       LPICmeup
+LPICedup:
+       andi.   r0,$inp,3
+       bne     Lunaligned
+Laligned:
+       add     $t0,$inp,$num
+       $PUSH   $t0,`$FRAME-$SIZE_T*23`($sp)    ; end pointer
+       bl      Lsha2_block_private
+Ldone:
+       $POP    r0,`$FRAME-$SIZE_T*21`($sp)
+       $POP    $toc,`$FRAME-$SIZE_T*20`($sp)
+       $POP    r13,`$FRAME-$SIZE_T*19`($sp)
+       $POP    r14,`$FRAME-$SIZE_T*18`($sp)
+       $POP    r15,`$FRAME-$SIZE_T*17`($sp)
+       $POP    r16,`$FRAME-$SIZE_T*16`($sp)
+       $POP    r17,`$FRAME-$SIZE_T*15`($sp)
+       $POP    r18,`$FRAME-$SIZE_T*14`($sp)
+       $POP    r19,`$FRAME-$SIZE_T*13`($sp)
+       $POP    r20,`$FRAME-$SIZE_T*12`($sp)
+       $POP    r21,`$FRAME-$SIZE_T*11`($sp)
+       $POP    r22,`$FRAME-$SIZE_T*10`($sp)
+       $POP    r23,`$FRAME-$SIZE_T*9`($sp)
+       $POP    r24,`$FRAME-$SIZE_T*8`($sp)
+       $POP    r25,`$FRAME-$SIZE_T*7`($sp)
+       $POP    r26,`$FRAME-$SIZE_T*6`($sp)
+       $POP    r27,`$FRAME-$SIZE_T*5`($sp)
+       $POP    r28,`$FRAME-$SIZE_T*4`($sp)
+       $POP    r29,`$FRAME-$SIZE_T*3`($sp)
+       $POP    r30,`$FRAME-$SIZE_T*2`($sp)
+       $POP    r31,`$FRAME-$SIZE_T*1`($sp)
+       mtlr    r0
+       addi    $sp,$sp,`$FRAME+16*$SZ`
+       blr
+___
+
+# PowerPC specification allows an implementation to be ill-behaved
+# upon unaligned access which crosses page boundary. "Better safe
+# than sorry" principle makes me treat it specially. I don't look
+# for the particular offending word, though, but rather for the
+# input block which crosses the boundary. Once found, that block is
+# copied to a buffer on the stack and hashed separately...
+$code.=<<___;
+.align 4
+Lunaligned:
+       subfic  $t1,$inp,4096
+       andi.   $t1,$t1,`4096-16*$SZ`   ; distance to closest page boundary
+       beq     Lcross_page
+       $UCMP   $num,$t1
+       ble-    Laligned                ; didn't cross the page boundary
+       subfc   $num,$t1,$num
+       add     $t0,$inp,$t1
+       $PUSH   $num,`$FRAME-$SIZE_T*24`($sp)
+       $PUSH   $t0,`$FRAME-$SIZE_T*23`($sp)    ; end pointer
+       bl      Lsha2_block_private
+       $POP    $num,`$FRAME-$SIZE_T*24`($sp)
+Lcross_page:
+       li      $t1,`16*$SZ/4`
+       mtctr   $t1
+       addi    r20,$sp,$FRAME  ; spot below the frame
+Lmemcpy:
+       lbz     r16,0($inp)
+       lbz     r17,1($inp)
+       lbz     r18,2($inp)
+       lbz     r19,3($inp)
+       addi    $inp,$inp,4
+       stb     r16,0(r20)
+       stb     r17,1(r20)
+       stb     r18,2(r20)
+       stb     r19,3(r20)
+       addi    r20,r20,4
+       bdnz    Lmemcpy
+
+       $PUSH   $inp,`$FRAME-$SIZE_T*25`($sp)
+       addi    $inp,$sp,$FRAME
+       addi    $t0,$sp,`$FRAME+16*$SZ`
+       $PUSH   $num,`$FRAME-$SIZE_T*24`($sp)
+       $PUSH   $t0,`$FRAME-$SIZE_T*23`($sp)    ; end pointer
+       bl      Lsha2_block_private
+       $POP    $inp,`$FRAME-$SIZE_T*25`($sp)
+       $POP    $num,`$FRAME-$SIZE_T*24`($sp)
+       addic.  $num,$num,`-16*$SZ`
+       bne-    Lunaligned
+       b       Ldone
+___
+
+$code.=<<___;
+.align 4
+Lsha2_block_private:
+___
+for($i=0;$i<16;$i++) {  # rounds 0..15: emit the load of message word $i from $inp, then the round itself
+$code.=<<___ if ($SZ==4);  # 4-byte hash words: a single word load
+       lwz     @X[$i],`$i*$SZ`($inp)
+___
+# 64-bit loads are split to 2x32-bit ones, as CPU can't handle
+# unaligned 64-bit loads, only 32-bit ones...
+$code.=<<___ if ($SZ==8);  # 8-byte hash words: two word loads, with $t0 inserted as the upper half
+       lwz     $t0,`$i*$SZ`($inp)
+       lwz     @X[$i],`$i*$SZ+4`($inp)
+       insrdi  @X[$i],$t0,32,0
+___
+       &ROUND_00_15($i,@V);  # @V supplies the eight register names in their current roles
+       unshift(@V,pop(@V));  # rotate roles for the next round: the last name moves to the front
+}
+$code.=<<___;
+       li      $T,`$rounds/16-1`
+       mtctr   $T
+.align 4
+Lrounds:
+       addi    $Tbl,$Tbl,`16*$SZ`
+___
+for(;$i<32;$i++) {  # $i continues from 16; these 16 rounds are emitted once and repeated at run time by the bdnz loop at Lrounds
+       &ROUND_16_xx($i,@V);
+       unshift(@V,pop(@V));  # 32 rotations in total, so @V ends up back in its initial order
+}
+$code.=<<___;
+       bdnz-   Lrounds
+
+       subi    $Tbl,$Tbl,`($rounds-16)*$SZ`
+       $POP    $ctx,`$FRAME-$SIZE_T*22`($sp)
+       $POP    $num,`$FRAME-$SIZE_T*23`($sp)   ; end pointer
+
+       $LD     r16,`0*$SZ`($ctx)
+       $LD     r17,`1*$SZ`($ctx)
+       $LD     r18,`2*$SZ`($ctx)
+       $LD     r19,`3*$SZ`($ctx)
+       $LD     r20,`4*$SZ`($ctx)
+       $LD     r21,`5*$SZ`($ctx)
+       $LD     r22,`6*$SZ`($ctx)
+       $LD     r23,`7*$SZ`($ctx)
+       add     $A,$A,r16
+       add     $B,$B,r17
+       add     $C,$C,r18
+       $ST     $A,`0*$SZ`($ctx)
+       add     $D,$D,r19
+       $ST     $B,`1*$SZ`($ctx)
+       add     $E,$E,r20
+       $ST     $C,`2*$SZ`($ctx)
+       add     $F,$F,r21
+       $ST     $D,`3*$SZ`($ctx)
+       add     $G,$G,r22
+       $ST     $E,`4*$SZ`($ctx)
+       add     $H,$H,r23
+       $ST     $F,`5*$SZ`($ctx)
+       addi    $inp,$inp,`16*$SZ`
+       $ST     $G,`6*$SZ`($ctx)
+       $UCMP   $inp,$num
+       $ST     $H,`7*$SZ`($ctx)
+       bne     Lsha2_block_private
+       blr
+___
+
+# Ugly hack here, because PPC assembler syntax seems to vary too
+# much from platform to platform...
+$code.=<<___;
+.align 6
+LPICmeup:
+       bl      LPIC
+       b       LPICedup
+       nop
+       nop
+       nop
+       nop
+       nop
+       nop
+LPIC:  mflr    $Tbl
+       addi    $Tbl,$Tbl,`64-4`        ; "distance" between bl and last nop
+       blr
+       nop
+       nop
+       nop
+       nop
+       nop
+___
+$code.=<<___ if ($SZ==8);
+       .long   0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
+       .long   0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
+       .long   0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
+       .long   0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
+       .long   0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
+       .long   0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
+       .long   0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
+       .long   0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
+       .long   0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
+       .long   0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
+       .long   0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
+       .long   0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
+       .long   0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
+       .long   0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
+       .long   0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
+       .long   0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
+       .long   0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
+       .long   0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
+       .long   0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
+       .long   0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
+       .long   0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
+       .long   0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
+       .long   0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
+       .long   0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
+       .long   0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
+       .long   0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
+       .long   0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
+       .long   0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
+       .long   0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
+       .long   0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
+       .long   0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
+       .long   0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
+       .long   0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
+       .long   0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
+       .long   0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
+       .long   0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
+       .long   0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
+       .long   0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
+       .long   0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
+       .long   0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
+___
+$code.=<<___ if ($SZ==4);
+       .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+       .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+       .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+       .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+       .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+       .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+       .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+       .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+       .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+       .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+       .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+       .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+       .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+       .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+       .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+       .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;  # replace every backtick-quoted expression in the template with its evaluated value
+print $code;  # goes to STDOUT, which may have been reopened as the ppc-xlate.pl pipe
+close STDOUT;  # return value is not checked