Yet another "teaser" Montgomery multiplication module, for PowerPC.
author Andy Polyakov <appro@openssl.org>
Sun, 30 Apr 2006 21:15:29 +0000 (21:15 +0000)
committer Andy Polyakov <appro@openssl.org>
Sun, 30 Apr 2006 21:15:29 +0000 (21:15 +0000)
Configure
crypto/bn/Makefile
crypto/bn/asm/ppc-mont.pl [new file with mode: 0644]
crypto/perlasm/ppc-xlate.pl [new file with mode: 0755]

index 030570a36ee01ed16ee7aa3872032d47ba91ebca..f8e4ca7afe37282c6e5e92a5f089a68da39b4bfa 100755 (executable)
--- a/Configure
+++ b/Configure
@@ -314,7 +314,7 @@ my %table=(
 # *-generic* is endian-neutral target, but ./config is free to
 # throw in -D[BL]_ENDIAN, whichever appropriate...
 "linux-generic32","gcc:-DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
-"linux-ppc",   "gcc:-DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL::linux_ppc32.o:::::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"linux-ppc",   "gcc:-DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL::linux_ppc32.o linux_ppc32-mont.o:::::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 #### IA-32 targets...
 "linux-ia32-icc",      "icc:-DL_ENDIAN -DTERMIO -O2 -no_cpprt::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-KPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-elf",   "gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
@@ -322,7 +322,7 @@ my %table=(
 ####
 "linux-generic64","gcc:-DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 # -bpowerpc64-linux is transient option, -m64 should be the one to use...
-"linux-ppc64", "gcc:-bpowerpc64-linux -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL::linux_ppc64.o:::::::::::dlfcn:linux-shared:-fPIC:-bpowerpc64-linux:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"linux-ppc64", "gcc:-bpowerpc64-linux -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL::linux_ppc64.o linux_ppc64-mont.o:::::::::::dlfcn:linux-shared:-fPIC:-bpowerpc64-linux:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-ia64",  "gcc:-DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_UNROLL DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-ia64-icc","icc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
@@ -407,12 +407,12 @@ my %table=(
 
 #### IBM's AIX.
 "aix3-cc",  "cc:-O -DB_ENDIAN -qmaxmem=16384::(unknown):AIX::BN_LLONG RC4_CHAR:::",
-"aix-gcc",  "gcc:-O -DB_ENDIAN::-D_THREAD_SAFE:AIX::BN_LLONG RC4_CHAR::aix_ppc32.o:::::::::::dlfcn:",
-"aix64-gcc","gcc:-O -DB_ENDIAN::-D_THREAD_SAFE:AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR::aix_ppc64.o:::::::::::dlfcn::::::-X64",
+"aix-gcc",  "gcc:-O -DB_ENDIAN::-D_THREAD_SAFE:AIX::BN_LLONG RC4_CHAR::aix_ppc32.o aix_ppc32-mont.o:::::::::::dlfcn:",
+"aix64-gcc","gcc:-O -DB_ENDIAN::-D_THREAD_SAFE:AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR::aix_ppc64.o aix_ppc64-mont.o:::::::::::dlfcn::::::-X64",
 # Below targets assume AIX 5. Idea is to effectively disregard $OBJECT_MODE
 # at build time. $OBJECT_MODE is respected at ./config stage!
-"aix-cc",   "cc:-q32 -O -DB_ENDIAN -qmaxmem=16384::-qthreaded:AIX::BN_LLONG RC4_CHAR::aix_ppc32.o:::::::::::dlfcn:aix-shared::-q32:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 32",
-"aix64-cc", "cc:-q64 -O -DB_ENDIAN -qmaxmem=16384::(unknown):AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR::aix_ppc64.o:::::::::::dlfcn:aix-shared::-q64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 64",
+"aix-cc",   "cc:-q32 -O -DB_ENDIAN -qmaxmem=16384::-qthreaded:AIX::BN_LLONG RC4_CHAR::aix_ppc32.o aix_ppc32-mont.o:::::::::::dlfcn:aix-shared::-q32:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 32",
+"aix64-cc", "cc:-q64 -O -DB_ENDIAN -qmaxmem=16384::(unknown):AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR::aix_ppc64.o aix_ppc64-mont.o:::::::::::dlfcn:aix-shared::-q64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 64",
 
 #
 # Cray T90 and similar (SDSC)
@@ -504,9 +504,10 @@ my %table=(
 
 ##### MacOS X (a.k.a. Rhapsody or Darwin) setup
 "rhapsody-ppc-cc","cc:-O3 -DB_ENDIAN::(unknown):MACOSX_RHAPSODY::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${no_asm}::",
-"darwin-ppc-cc","cc:-O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc32.o:::::::::::dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
+"darwin-ppc-cc","cc:-O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc32.o osx_ppc32-mont.o:::::::::::dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
+"darwin64-ppc-cc","cc:-m64 -O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc64.o osx_ppc64-mont.o:::::::::::dlfcn:darwin-shared:-fPIC -fno-common:-m64 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
 "darwin-i386-cc","cc:-O3 -fomit-frame-pointer -DL_ENDIAN::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${no_asm}:dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
-"debug-darwin-ppc-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -DB_ENDIAN -g -Wall -O::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc32.o:::::::::::dlfcn:darwin-shared:-fPIC:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
+"debug-darwin-ppc-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -DB_ENDIAN -g -Wall -O::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc32.o osx_ppc32-mont.o:::::::::::dlfcn:darwin-shared:-fPIC:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
 
 ##### A/UX
 "aux3-gcc","gcc:-O2 -DTERMIO::(unknown):AUX:-lbsd:RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:::",
index e494c02ee8e928ca9e2ae446c8b6183f03725fd2..8802dde70e0c9ce80ac99840a4a0808681edc84a 100644 (file)
@@ -120,6 +120,14 @@ linux_ppc64.s: asm/ppc.pl; $(PERL) $< $@
 aix_ppc32.s: asm/ppc.pl;       $(PERL) asm/ppc.pl $@
 aix_ppc64.s: asm/ppc.pl;       $(PERL) asm/ppc.pl $@
 osx_ppc32.s: asm/ppc.pl;       $(PERL) $< $@
+osx_ppc64.s: asm/ppc.pl;       $(PERL) $< $@
+
+linux_ppc32-mont.s: asm/ppc-mont.pl;   $(PERL) $< $@
+linux_ppc64-mont.s: asm/ppc-mont.pl;   $(PERL) $< $@
+aix_ppc32-mont.s: asm/ppc-mont.pl;     $(PERL) asm/ppc-mont.pl $@
+aix_ppc64-mont.s: asm/ppc-mont.pl;     $(PERL) asm/ppc-mont.pl $@
+osx_ppc32-mont.s: asm/ppc-mont.pl;     $(PERL) $< $@
+osx_ppc64-mont.s: asm/ppc-mont.pl;     $(PERL) $< $@
 
 files:
        $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl
new file mode 100644 (file)
index 0000000..8a26021
--- /dev/null
@@ -0,0 +1,327 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. Rights for redistribution and usage in source and binary
+# forms are granted according to the OpenSSL license.
+# ====================================================================
+
+# April 2006
+
+# "Teaser" Montgomery multiplication module for PowerPC. It's possible
+# to gain a bit more by modulo-scheduling outer loop, then dedicated
+# squaring procedure should give further 20% and code can be adapted
+# for 32-bit application running on 64-bit CPU. As for the latter,
+# it won't be able to achieve "native" 64-bit performance, because in
+# 32-bit application context every addc instruction will have to be
+# expanded as addc, twice right shift by 32 and finally adde, etc.
+# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
+# for 64-bit application running on PPC970/G5 is:
+#
+# 512-bit      +65%    
+# 1024-bit     +35%
+# 2048-bit     +18%
+# 4096-bit     +4%
+
+$output = shift;               # output file name; its "32-mont.s"/"64-mont.s" part selects the flavour
+
+if ($output =~ /32\-mont\.s/) {        # 32-bit flavour
+       $BITS=  32;             # word size in bits
+       $BNSZ=  $BITS/8;        # bytes per vector word; used as the index stride in the loops
+       $SIZE_T=4;              # bytes per register save slot in the frame
+       $RZONE= 224;            # extra bytes added when sizing the frame; presumably the ABI red zone -- TODO confirm
+       $FRAME= $SIZE_T*16;     # fixed part of the frame (save slots); tp[] starts at $sp+$FRAME
+
+       $LD=    "lwz";          # load
+       $LDU=   "lwzu";         # load and update
+       $LDX=   "lwzx";         # load indexed
+       $ST=    "stw";          # store
+       $STU=   "stwu";         # store and update
+       $STX=   "stwx";         # store indexed
+       $STUX=  "stwux";        # store indexed and update
+       $UMULL= "mullw";        # unsigned multiply low
+       $UMULH= "mulhwu";       # unsigned multiply high
+       $UCMP=  "cmplw";        # unsigned compare
+       $PUSH=  $ST;            # save a register into its frame slot
+       $POP=   $LD;            # reload a register from its frame slot
+} elsif ($output =~ /64\-mont\.s/) {   # 64-bit flavour
+       $BITS=  64;             # word size in bits
+       $BNSZ=  $BITS/8;        # bytes per vector word; used as the index stride in the loops
+       $SIZE_T=8;              # bytes per register save slot in the frame
+       $RZONE= 288;            # extra bytes added when sizing the frame; presumably the ABI red zone -- TODO confirm
+       $FRAME= $SIZE_T*16;     # fixed part of the frame (save slots); tp[] starts at $sp+$FRAME
+
+       # same as above, but 64-bit mnemonics...
+       $LD=    "ld";           # load
+       $LDU=   "ldu";          # load and update
+       $LDX=   "ldx";          # load indexed
+       $ST=    "std";          # store
+       $STU=   "stdu";         # store and update
+       $STX=   "stdx";         # store indexed
+       $STUX=  "stdux";        # store indexed and update
+       $UMULL= "mulld";        # unsigned multiply low
+       $UMULH= "mulhdu";       # unsigned multiply high
+       $UCMP=  "cmpld";        # unsigned compare
+       $PUSH=  $ST;            # save a register into its frame slot
+       $POP=   $LD;            # reload a register from its frame slot
+} else { die "nonsense $output"; }     # the file name matches neither flavour
+
+( defined shift || open STDOUT,"| $^X ../perlasm/ppc-xlate.pl $output" ) ||   # any 2nd argument skips the pipe, so the untranslated text stays on STDOUT
+       die "can't call ../perlasm/ppc-xlate.pl: $!";
+
+$sp="r1";      # frame pointer: moved by $STUX on entry, reloaded from 0($sp) on exit
+$toc="r2";     # not referenced by the code below
+$rp="r3";      $ovf="r3";      # r3 holds rp on entry and is then reused as $ovf
+$ap="r4";      # base of ap[]
+$bp="r5";      # base of bp[]
+$np="r6";      # base of np[]
+$n0="r7";      # pointer to n0 on entry; replaced by the n0[0] value
+$num="r8";     # number of words in each vector
+$rp="r9";      # $rp is reassigned
+$aj="r10";     # current ap[j]
+$nj="r11";     # current np[j]
+$tj="r12";     # current tp[j]; also used as scratch
+# non-volatile registers
+$i="r14";      # outer loop byte offset into bp[]
+$j="r15";      # inner loop byte offset into ap[]/np[]
+$tp="r16";     # running pointer into tp[]
+$m0="r17";     # bp[i]
+$m1="r18";     # tp[0]*n0
+$lo0="r19";    # low word of the ap[j]*bp[i] accumulation
+$hi0="r20";    # high word / carry of the ap[j]*bp[i] accumulation
+$lo1="r21";    # low word of the np[j]*m1 accumulation
+$hi1="r22";    # high word / carry of the np[j]*m1 accumulation
+$alo="r23";    # low half of the ap[j]*m0 product
+$ahi="r24";    # high half of the ap[j]*m0 product
+$nlo="r25";    # low half of the np[j]*m1 product
+#
+$nhi="r0";     # high half of the np[j]*m1 product; r0 is not saved or restored below
+
+$code=<<___;   # assembly template for bn_mul_mont; $names interpolate now, backquoted spans are evaluated by the s///e further down
+.text
+
+.globl .bn_mul_mont
+.align 4
+.bn_mul_mont:
+       cmpwi   $num,4
+       mr      $rp,r3          ; $rp is reassigned
+       li      r3,0
+       bltlr
+
+       slwi    $num,$num,`log($BNSZ)/log(2)`
+       li      $tj,-4096
+       addi    $ovf,$num,`$FRAME+$RZONE`
+       subf    $ovf,$ovf,$sp   ; $sp-$ovf
+       and     $ovf,$ovf,$tj   ; minimize TLB usage
+       subf    $ovf,$sp,$ovf   ; $ovf-$sp
+       srwi    $num,$num,`log($BNSZ)/log(2)`
+       $STUX   $sp,$sp,$ovf
+
+       $PUSH   r14,`4*$SIZE_T`($sp)
+       $PUSH   r15,`5*$SIZE_T`($sp)
+       $PUSH   r16,`6*$SIZE_T`($sp)
+       $PUSH   r17,`7*$SIZE_T`($sp)
+       $PUSH   r18,`8*$SIZE_T`($sp)
+       $PUSH   r19,`9*$SIZE_T`($sp)
+       $PUSH   r20,`10*$SIZE_T`($sp)
+       $PUSH   r21,`11*$SIZE_T`($sp)
+       $PUSH   r22,`12*$SIZE_T`($sp)
+       $PUSH   r23,`13*$SIZE_T`($sp)
+       $PUSH   r24,`14*$SIZE_T`($sp)
+       $PUSH   r25,`15*$SIZE_T`($sp)
+
+       $LD     $n0,0($n0)      ; pull n0[0] value
+       addi    $num,$num,-2    ; adjust $num for counter register
+\f
+       $LD     $m0,0($bp)      ; m0=bp[0]
+       $LD     $aj,0($ap)      ; ap[0]
+       addi    $tp,$sp,$FRAME
+       $UMULL  $lo0,$aj,$m0    ; ap[0]*bp[0]
+       $UMULH  $hi0,$aj,$m0
+
+       $LD     $aj,$BNSZ($ap)  ; ap[1]
+       $LD     $nj,0($np)      ; np[0]
+
+       $UMULL  $m1,$lo0,$n0    ; "tp[0]"*n0
+
+       $UMULL  $alo,$aj,$m0    ; ap[1]*bp[0]
+       $UMULH  $ahi,$aj,$m0
+
+       $UMULL  $lo1,$nj,$m1    ; np[0]*m1
+       $UMULH  $hi1,$nj,$m1
+       $LD     $nj,$BNSZ($np)  ; np[1]
+       addc    $lo1,$lo1,$lo0
+       addze   $hi1,$hi1
+
+       $UMULL  $nlo,$nj,$m1    ; np[1]*m1
+       $UMULH  $nhi,$nj,$m1
+
+       mtctr   $num
+       li      $j,`2*$BNSZ`
+.align 4
+L1st:
+       $LDX    $aj,$ap,$j      ; ap[j]
+       $LDX    $nj,$np,$j      ; np[j]
+       addc    $lo0,$alo,$hi0
+       addze   $hi0,$ahi
+       $UMULL  $alo,$aj,$m0    ; ap[j]*bp[0]
+       $UMULH  $ahi,$aj,$m0
+
+       addc    $lo1,$nlo,$hi1
+       addze   $hi1,$nhi
+       $UMULL  $nlo,$nj,$m1    ; np[j]*m1
+       $UMULH  $nhi,$nj,$m1
+       addc    $lo1,$lo1,$lo0  ; np[j]*m1+ap[j]*bp[0]
+       addze   $hi1,$hi1
+       $ST     $lo1,0($tp)     ; tp[j-1]
+
+       addi    $j,$j,$BNSZ     ; j++
+       addi    $tp,$tp,$BNSZ   ; tp++
+       bdnz-   L1st
+;L1st
+       addc    $lo0,$alo,$hi0
+       addze   $hi0,$ahi
+
+       addc    $lo1,$nlo,$hi1
+       addze   $hi1,$nhi
+       addc    $lo1,$lo1,$lo0  ; np[j]*m1+ap[j]*bp[0]
+       addze   $hi1,$hi1
+       $ST     $lo1,0($tp)     ; tp[j-1]
+
+       li      $ovf,0
+       addc    $hi1,$hi1,$hi0
+       addze   $ovf,$ovf       ; upmost overflow bit
+       $ST     $hi1,$BNSZ($tp)
+\f
+       li      $i,$BNSZ
+.align 4
+Louter:
+       $LDX    $m0,$bp,$i      ; m0=bp[i]
+       $LD     $aj,0($ap)      ; ap[0]
+       addi    $tp,$sp,$FRAME
+       $LD     $tj,$FRAME($sp) ; tp[0]
+       $UMULL  $lo0,$aj,$m0    ; ap[0]*bp[i]
+       $UMULH  $hi0,$aj,$m0
+       $LD     $aj,$BNSZ($ap)  ; ap[1]
+       $LD     $nj,0($np)      ; np[0]
+       addc    $lo0,$lo0,$tj   ; ap[0]*bp[i]+tp[0]
+       addze   $hi0,$hi0
+
+       $UMULL  $m1,$lo0,$n0    ; tp[0]*n0
+
+       $UMULL  $alo,$aj,$m0    ; ap[j]*bp[i]
+       $UMULH  $ahi,$aj,$m0
+
+       $UMULL  $lo1,$nj,$m1    ; np[0]*m1
+       $UMULH  $hi1,$nj,$m1
+       $LD     $nj,$BNSZ($np)  ; np[1]
+       addc    $lo1,$lo1,$lo0
+       addze   $hi1,$hi1
+
+       $UMULL  $nlo,$nj,$m1    ; np[1]*m1
+       $UMULH  $nhi,$nj,$m1
+\f
+       mtctr   $num
+       li      $j,`2*$BNSZ`
+.align 4
+Linner:
+       $LDX    $aj,$ap,$j      ; ap[j]
+       $LD     $tj,$BNSZ($tp)  ; tp[j]
+       addc    $lo0,$alo,$hi0
+       addze   $hi0,$ahi
+       $LDX    $nj,$np,$j      ; np[j]
+       addc    $lo0,$lo0,$tj   ; ap[j]*bp[i]+tp[j]
+       addze   $hi0,$hi0
+       $UMULL  $alo,$aj,$m0    ; ap[j]*bp[i]
+       $UMULH  $ahi,$aj,$m0
+
+       addc    $lo1,$nlo,$hi1
+       addze   $hi1,$nhi
+       $UMULL  $nlo,$nj,$m1    ; np[j]*m1
+       $UMULH  $nhi,$nj,$m1
+       addc    $lo1,$lo1,$lo0  ; np[j]*m1+ap[j]*bp[i]+tp[j]
+       addze   $hi1,$hi1
+       $ST     $lo1,0($tp)     ; tp[j-1]
+
+       addi    $j,$j,$BNSZ     ; j++
+       addi    $tp,$tp,$BNSZ   ; tp++
+       bdnz-   Linner
+;Linner
+       $LD     $tj,$BNSZ($tp)  ; tp[j]
+       addc    $lo0,$alo,$hi0
+       addze   $hi0,$ahi
+       addc    $lo0,$lo0,$tj   ; ap[j]*bp[i]+tp[j]
+       addze   $hi0,$hi0
+
+       addc    $lo1,$nlo,$hi1
+       addze   $hi1,$nhi
+       addc    $lo1,$lo1,$lo0  ; np[j]*m1+ap[j]*bp[i]+tp[j]
+       addze   $hi1,$hi1
+       $ST     $lo1,0($tp)     ; tp[j-1]
+
+       addic   $ovf,$ovf,-1    ; move upmost overflow to XER[CA]
+       li      $ovf,0
+       adde    $hi1,$hi1,$hi0
+       addze   $ovf,$ovf
+       $ST     $hi1,$BNSZ($tp)
+;
+       slwi    $tj,$num,`log($BNSZ)/log(2)`
+       $UCMP   $i,$tj
+       addi    $i,$i,$BNSZ
+       ble-    Louter
+\f
+       addi    $num,$num,2     ; restore $num
+       addi    $tp,$sp,$FRAME
+       mtctr   $num
+       li      $j,0
+
+       subfc.  $ovf,$j,$ovf    ; sets XER[CA]
+       bne     Lsub
+       $UCMP   $hi1,$nj
+       bge     Lsub
+.align 4
+Lcopy:
+       $LDX    $tj,$tp,$j
+       $STX    $tj,$rp,$j
+       $STX    $j,$tp,$j       ; zap at once
+       addi    $j,$j,$BNSZ
+       bdnz-   Lcopy
+
+Lexit:
+       $POP    r14,`4*$SIZE_T`($sp)
+       $POP    r15,`5*$SIZE_T`($sp)
+       $POP    r16,`6*$SIZE_T`($sp)
+       $POP    r17,`7*$SIZE_T`($sp)
+       $POP    r18,`8*$SIZE_T`($sp)
+       $POP    r19,`9*$SIZE_T`($sp)
+       $POP    r20,`10*$SIZE_T`($sp)
+       $POP    r21,`11*$SIZE_T`($sp)
+       $POP    r22,`12*$SIZE_T`($sp)
+       $POP    r23,`13*$SIZE_T`($sp)
+       $POP    r24,`14*$SIZE_T`($sp)
+       $POP    r25,`15*$SIZE_T`($sp)
+       $POP    $sp,0($sp)
+       li      r3,1
+       blr
+       .long   0
+.align 4
+Lsub:  $LDX    $tj,$tp,$j
+       $LDX    $nj,$np,$j
+       subfe   $tj,$nj,$tj     ; tp[j]-np[j]
+       $STX    $tj,$rp,$j
+       addi    $j,$j,$BNSZ
+       bdnz-   Lsub
+       li      $j,0
+       subfe.  $ovf,$j,$ovf
+       mtctr   $num
+       bne     Lcopy
+.align 4
+Lzap:  $STX    $j,$tp,$j
+       addi    $j,$j,$BNSZ
+       bdnz-   Lzap
+       b       Lexit
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;   # replace every backquoted span with the value of the Perl expression inside it
+print $code;   # goes to the ppc-xlate.pl pipe when one was opened above, else to the original STDOUT
+close STDOUT;  # also closes the pipe, if any
diff --git a/crypto/perlasm/ppc-xlate.pl b/crypto/perlasm/ppc-xlate.pl
new file mode 100755 (executable)
index 0000000..31876e5
--- /dev/null
@@ -0,0 +1,113 @@
+#!/usr/bin/env perl
+
+# PowerPC assembler distiller by <appro>.
+
+my $output = shift;            # output file name; also serves as the flavour string below
+open STDOUT,">$output" or die "can't open $output: $!";        # "or", not "||": "||" binds to the (always true) string, so die could never run
+
+my $flavour = $output;         # matched against aix / osx / linux.*32 / linux.*64 by the handlers
+my %GLOBALS;                   # global name as written in the source -> platform-decorated symbol
+my $dotinlocallabels=0;        # set by the .globl handler on linux so that Lxxx labels become .Lxxx
+
+################################################################
+# directives which need special treatment on different platforms
+################################################################
+my $globl = sub {              # handler for the .globl directive; returns the replacement text
+    my $junk = shift;          # suffix slot passed by the dispatcher, unused here
+    my $name = shift;          # symbol name as written in the source
+    my $global = \$GLOBALS{$name};     # slot keyed by the undecorated source spelling
+    my $ret;
+
+    $name =~ s|^[\.\_]||;      # drop one leading "." or "_" before re-decorating per flavour
+    SWITCH: for ($flavour) {
+       /aix/           && do { $name = ".$name";
+                               last;
+                             };
+       /osx/           && do { $name = "_$name";
+                               last;
+                             };
+       /linux.*32/     && do { $ret .= ".globl $name\n";
+                               $ret .= ".type  $name,\@function";
+                               $dotinlocallabels = 1;
+                               last;
+                             };
+       /linux.*64/     && do { $ret .= ".globl .$name\n";
+                               $ret .= ".type  .$name,\@function\n";
+                               $ret .= ".section       \".opd\",\"aw\"\n";
+                               $ret .= ".globl $name\n";
+                               $ret .= ".align 3\n";
+                               $ret .= "$name:\n";
+                               $ret .= ".quad  .$name,.TOC.\@tocbase,0\n";
+                               $ret .= ".size  $name,24\n";
+                               $ret .= ".previous\n";
+
+                               $name = ".$name";
+                               $dotinlocallabels = 1;
+                               last;
+                             };
+    }
+
+    $ret = ".globl     $name" if (!$ret);      # flavours that built nothing above get a plain .globl
+    $$global = $name;          # recorded so the main loop prints the label with the decorated name
+    $ret;
+};
+my $machine = sub {            # handler for the .machine directive; returns the replacement text
+    my $junk = shift;          # suffix slot passed by the dispatcher, unused here
+    my $arch = shift;          # architecture operand as written in the source
+    $arch = "ppc970" if ($arch eq "any" and $flavour =~ /osx/);        # on the osx flavour "any" is replaced by "ppc970"
+    ".machine  $arch";
+};
+
+################################################################
+# simplified mnemonics not handled by at least one assembler
+################################################################
+my $cmplw = sub {              # rewrites "cmplw [cr,]ra,rb" as the four-operand form "cmpl cr,0,ra,rb"
+    my $f = shift;             # suffix character captured after the mnemonic (may be empty)
+    my $cr = 0; $cr = shift if ($#_>1);        # three or more operands left: the first is the CR field, else 0
+    "  cmpl$f  ".join(',',$cr,0,@_);
+};
+my $cmpld = sub {              # rewrites "cmpld [cr,]ra,rb" as the four-operand form "cmpl cr,1,ra,rb"
+    my $f = shift;             # suffix character captured after the mnemonic (may be empty)
+    my $cr = 0; $cr = shift if ($#_>1);        # three or more operands left: the first is the CR field, else 0
+    "  cmpl$f  ".join(',',$cr,1,@_);
+};
+my $bdnz = sub {               # rewrites bdnz / bdnz+ / bdnz- as an explicit "bc BO,0,target"
+    my $f = shift;             # suffix character captured after the mnemonic (may be empty)
+    my $bo = $f=~/[\+\-]/ ? 17 : 16;   # first bc operand: 17 when a "+" or "-" suffix is present, 16 otherwise
+    "  bc      $bo,0,".shift;
+};
+
+while($line=<>) {              # translate the input one line at a time
+
+    $line =~ s|[#!;].*$||;     # get rid of asm-style comments...
+    $line =~ s|/\*.*\*/||;     # ... and C-style comments...
+    $line =~ s|^\s+||;         # ... and skip white spaces in beginning...
+    $line =~ s|\s+$||;         # ... and at the end
+
+    {
+       $line =~ s|(?<![\w\.])\.L(\w+)|L$1|g;   # common denominator for Locallabel; lookbehind because \b cannot match in front of a leading "."
+       $line =~ s|\bL(\w+)|\.L$1|g     if ($dotinlocallabels);
+    }
+
+    {
+       $line =~ s|(^[\.\w]+)\:\s*||;   # peel off a leading "label:"
+       my $label = $1;
+       printf "%s:",($GLOBALS{$label} or $label) if ($label);  # globals are printed under their decorated name
+    }
+
+    {
+       $line =~ s|^\s*(\.?)(\w+)([\.\+\-]?)\s*||;      # split off optional ".", the mnemonic and an optional ./+/- suffix
+       my $c = $1; $c = "\t" if ($c eq "");            # directives keep their ".", instructions get a tab
+       my $mnemonic = $2;
+       my $f = $3;
+       my $opcode = eval("\$$mnemonic");               # a handler is any lexical above named after the mnemonic
+       if (ref($opcode) eq 'CODE') { $line = &$opcode($f,split(',',$line)); }
+       elsif ($mnemonic)           { $line = $c.$mnemonic.$f."\t".$line; }
+    }
+
+    print $line if ($line);
+    print "\n";
+}
+
+close STDOUT;