From 2c5d4daac5d23416ab619bfbcb0189227ffdf71f Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sun, 30 Apr 2006 21:15:29 +0000 Subject: [PATCH] Yet another "teaser" Montgomery multiplication module, for PowerPC. --- Configure | 17 +- crypto/bn/Makefile | 8 + crypto/bn/asm/ppc-mont.pl | 327 ++++++++++++++++++++++++++++++++++++ crypto/perlasm/ppc-xlate.pl | 113 +++++++++++++ 4 files changed, 457 insertions(+), 8 deletions(-) create mode 100644 crypto/bn/asm/ppc-mont.pl create mode 100755 crypto/perlasm/ppc-xlate.pl diff --git a/Configure b/Configure index 030570a36e..f8e4ca7afe 100755 --- a/Configure +++ b/Configure @@ -314,7 +314,7 @@ my %table=( # *-generic* is endian-neutral target, but ./config is free to # throw in -D[BL]_ENDIAN, whichever appropriate... "linux-generic32","gcc:-DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", -"linux-ppc", "gcc:-DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL::linux_ppc32.o:::::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"linux-ppc", "gcc:-DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL::linux_ppc32.o linix_ppc32-mont.o:::::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", #### IA-32 targets... "linux-ia32-icc", "icc:-DL_ENDIAN -DTERMIO -O2 -no_cpprt::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-KPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-elf", "gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", @@ -322,7 +322,7 @@ my %table=( #### "linux-generic64","gcc:-DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", # -bpowerpc64-linux is transient option, -m64 should be the one to use... -"linux-ppc64", "gcc:-bpowerpc64-linux -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL::linux_ppc64.o:::::::::::dlfcn:linux-shared:-fPIC:-bpowerpc64-linux:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"linux-ppc64", "gcc:-bpowerpc64-linux -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL::linux_ppc64.o linux_ppc64-mont.o:::::::::::dlfcn:linux-shared:-fPIC:-bpowerpc64-linux:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-ia64", "gcc:-DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_UNROLL DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-ia64-icc","icc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", @@ -407,12 +407,12 @@ my %table=( #### IBM's AIX. "aix3-cc", "cc:-O -DB_ENDIAN -qmaxmem=16384::(unknown):AIX::BN_LLONG RC4_CHAR:::", -"aix-gcc", "gcc:-O -DB_ENDIAN::-D_THREAD_SAFE:AIX::BN_LLONG RC4_CHAR::aix_ppc32.o:::::::::::dlfcn:", -"aix64-gcc","gcc:-O -DB_ENDIAN::-D_THREAD_SAFE:AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR::aix_ppc64.o:::::::::::dlfcn::::::-X64", +"aix-gcc", "gcc:-O -DB_ENDIAN::-D_THREAD_SAFE:AIX::BN_LLONG RC4_CHAR::aix_ppc32.o aix_ppc32-mont.o:::::::::::dlfcn:", +"aix64-gcc","gcc:-O -DB_ENDIAN::-D_THREAD_SAFE:AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR::aix_ppc64.o aix_ppc64-mont.o:::::::::::dlfcn::::::-X64", # Below targets assume AIX 5. Idea is to effectively disregard $OBJECT_MODE # at build time. $OBJECT_MODE is respected at ./config stage! -"aix-cc", "cc:-q32 -O -DB_ENDIAN -qmaxmem=16384::-qthreaded:AIX::BN_LLONG RC4_CHAR::aix_ppc32.o:::::::::::dlfcn:aix-shared::-q32:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 32", -"aix64-cc", "cc:-q64 -O -DB_ENDIAN -qmaxmem=16384::(unknown):AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR::aix_ppc64.o:::::::::::dlfcn:aix-shared::-q64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 64", +"aix-cc", "cc:-q32 -O -DB_ENDIAN -qmaxmem=16384::-qthreaded:AIX::BN_LLONG RC4_CHAR::aix_ppc32.o aix_ppc32-mont.o:::::::::::dlfcn:aix-shared::-q32:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 32", +"aix64-cc", "cc:-q64 -O -DB_ENDIAN -qmaxmem=16384::(unknown):AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR::aix_ppc64.o aix_ppc64-mont.o:::::::::::dlfcn:aix-shared::-q64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 64", # # Cray T90 and similar (SDSC) @@ -504,9 +504,10 @@ my %table=( ##### MacOS X (a.k.a. Rhapsody or Darwin) setup "rhapsody-ppc-cc","cc:-O3 -DB_ENDIAN::(unknown):MACOSX_RHAPSODY::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${no_asm}::", -"darwin-ppc-cc","cc:-O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc32.o:::::::::::dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", +"darwin-ppc-cc","cc:-O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc32.o osx_ppc32-mont.o:::::::::::dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", +"darwin64-ppc-cc","cc:-m64 -O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc64.o osx_ppc64-mont.o:::::::::::dlfcn:darwin-shared:-fPIC -fno-common:-m64 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", "darwin-i386-cc","cc:-O3 -fomit-frame-pointer -DL_ENDIAN::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${no_asm}:dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", -"debug-darwin-ppc-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -DB_ENDIAN -g -Wall -O::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc32.o:::::::::::dlfcn:darwin-shared:-fPIC:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", +"debug-darwin-ppc-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -DB_ENDIAN -g -Wall -O::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc32.o osx_ppc32-mont.o:::::::::::dlfcn:darwin-shared:-fPIC:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", ##### A/UX "aux3-gcc","gcc:-O2 -DTERMIO::(unknown):AUX:-lbsd:RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:::", diff --git a/crypto/bn/Makefile b/crypto/bn/Makefile index e494c02ee8..8802dde70e 100644 --- a/crypto/bn/Makefile +++ b/crypto/bn/Makefile @@ -120,6 +120,14 @@ linux_ppc64.s: asm/ppc.pl; $(PERL) $< $@ aix_ppc32.s: asm/ppc.pl; $(PERL) asm/ppc.pl $@ aix_ppc64.s: asm/ppc.pl; $(PERL) asm/ppc.pl $@ osx_ppc32.s: asm/ppc.pl; $(PERL) $< $@ +osx_ppc64.s: asm/ppc.pl; $(PERL) $< $@ + +linux_ppc32-mont.s: asm/ppc-mont.pl; $(PERL) $< $@ +linux_ppc64-mont.s: asm/ppc-mont.pl; $(PERL) $< $@ +aix_ppc32-mont.s: asm/ppc-mont.pl; $(PERL) asm/ppc-mont.pl $@ +aix_ppc64-mont.s: asm/ppc-mont.pl; $(PERL) asm/ppc-mont.pl $@ +osx_ppc32-mont.s: asm/ppc-mont.pl; $(PERL) $< $@ +osx_ppc64-mont.s: asm/ppc-mont.pl; $(PERL) $< $@ files: $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl new file mode 100644 index 0000000000..8a260212b7 --- /dev/null +++ b/crypto/bn/asm/ppc-mont.pl @@ -0,0 +1,327 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. Rights for redistribution and usage in source and binary +# forms are granted according to the OpenSSL license. +# ==================================================================== + +# April 2006 + +# "Teaser" Montgomery multiplication module for PowerPC. It's possible +# to gain a bit more by modulo-scheduling outer loop, then dedicated +# squaring procedure should give further 20% and code can be adapted +# for 32-bit application running on 64-bit CPU. As for the latter. +# It won't be able to achieve "native" 64-bit performance, because in +# 32-bit application context every addc instruction will have to be +# expanded as addc, twice right shift by 32 and finally adde, etc. +# So far RSA *sign* performance improvement over pre-bn_mul_mont asm +# for 64-bit application running on PPC970/G5 is: +# +# 512-bit +65% +# 1024-bit +35% +# 2048-bit +18% +# 4096-bit +4% + +$output = shift; + +if ($output =~ /32\-mont\.s/) { + $BITS= 32; + $BNSZ= $BITS/8; + $SIZE_T=4; + $RZONE= 224; + $FRAME= $SIZE_T*16; + + $LD= "lwz"; # load + $LDU= "lwzu"; # load and update + $LDX= "lwzx"; # load indexed + $ST= "stw"; # store + $STU= "stwu"; # store and update + $STX= "stwx"; # store indexed + $STUX= "stwux"; # store indexed and update + $UMULL= "mullw"; # unsigned multiply low + $UMULH= "mulhwu"; # unsigned multiply high + $UCMP= "cmplw"; # unsigned compare + $PUSH= $ST; + $POP= $LD; +} elsif ($output =~ /64\-mont\.s/) { + $BITS= 64; + $BNSZ= $BITS/8; + $SIZE_T=8; + $RZONE= 288; + $FRAME= $SIZE_T*16; + + # same as above, but 64-bit mnemonics... + $LD= "ld"; # load + $LDU= "ldu"; # load and update + $LDX= "ldx"; # load indexed + $ST= "std"; # store + $STU= "stdu"; # store and update + $STX= "stdx"; # store indexed + $STUX= "stdux"; # store indexed and update + $UMULL= "mulld"; # unsigned multiply low + $UMULH= "mulhdu"; # unsigned multiply high + $UCMP= "cmpld"; # unsigned compare + $PUSH= $ST; + $POP= $LD; +} else { die "nonsense $output"; } + +( defined shift || open STDOUT,"| $^X ../perlasm/ppc-xlate.pl $output" ) || + die "can't call ../perlasm/ppc-xlate.pl: $!"; + +$sp="r1"; +$toc="r2"; +$rp="r3"; $ovf="r3"; +$ap="r4"; +$bp="r5"; +$np="r6"; +$n0="r7"; +$num="r8"; +$rp="r9"; # $rp is reassigned +$aj="r10"; +$nj="r11"; +$tj="r12"; +# non-volatile registers +$i="r14"; +$j="r15"; +$tp="r16"; +$m0="r17"; +$m1="r18"; +$lo0="r19"; +$hi0="r20"; +$lo1="r21"; +$hi1="r22"; +$alo="r23"; +$ahi="r24"; +$nlo="r25"; +# +$nhi="r0"; + +$code=<<___; +.text + +.globl .bn_mul_mont +.align 4 +.bn_mul_mont: + cmpwi $num,4 + mr $rp,r3 ; $rp is reassigned + li r3,0 + bltlr + + slwi $num,$num,`log($BNSZ)/log(2)` + li $tj,-4096 + addi $ovf,$num,`$FRAME+$RZONE` + subf $ovf,$ovf,$sp ; $sp-$ovf + and $ovf,$ovf,$tj ; minimize TLB usage + subf $ovf,$sp,$ovf ; $ovf-$sp + srwi $num,$num,`log($BNSZ)/log(2)` + $STUX $sp,$sp,$ovf + + $PUSH r14,`4*$SIZE_T`($sp) + $PUSH r15,`5*$SIZE_T`($sp) + $PUSH r16,`6*$SIZE_T`($sp) + $PUSH r17,`7*$SIZE_T`($sp) + $PUSH r18,`8*$SIZE_T`($sp) + $PUSH r19,`9*$SIZE_T`($sp) + $PUSH r20,`10*$SIZE_T`($sp) + $PUSH r21,`11*$SIZE_T`($sp) + $PUSH r22,`12*$SIZE_T`($sp) + $PUSH r23,`13*$SIZE_T`($sp) + $PUSH r24,`14*$SIZE_T`($sp) + $PUSH r25,`15*$SIZE_T`($sp) + + $LD $n0,0($n0) ; pull n0[0] value + addi $num,$num,-2 ; adjust $num for counter register + + $LD $m0,0($bp) ; m0=bp[0] + $LD $aj,0($ap) ; ap[0] + addi $tp,$sp,$FRAME + $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] + $UMULH $hi0,$aj,$m0 + + $LD $aj,$BNSZ($ap) ; ap[1] + $LD $nj,0($np) ; np[0] + + $UMULL $m1,$lo0,$n0 ; "tp[0]"*n0 + + $UMULL $alo,$aj,$m0 ; ap[1]*bp[0] + $UMULH $ahi,$aj,$m0 + + $UMULL $lo1,$nj,$m1 ; np[0]*m1 + $UMULH $hi1,$nj,$m1 + $LD $nj,$BNSZ($np) ; np[1] + addc $lo1,$lo1,$lo0 + addze $hi1,$hi1 + + $UMULL $nlo,$nj,$m1 ; np[1]*m1 + $UMULH $nhi,$nj,$m1 + + mtctr $num + li $j,`2*$BNSZ` +.align 4 +L1st: + $LDX $aj,$ap,$j ; ap[j] + $LDX $nj,$np,$j ; np[j] + addc $lo0,$alo,$hi0 + addze $hi0,$ahi + $UMULL $alo,$aj,$m0 ; ap[j]*bp[0] + $UMULH $ahi,$aj,$m0 + + addc $lo1,$nlo,$hi1 + addze $hi1,$nhi + $UMULL $nlo,$nj,$m1 ; np[j]*m1 + $UMULH $nhi,$nj,$m1 + addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] + addze $hi1,$hi1 + $ST $lo1,0($tp) ; tp[j-1] + + addi $j,$j,$BNSZ ; j++ + addi $tp,$tp,$BNSZ ; tp++ + bdnz- L1st +;L1st + addc $lo0,$alo,$hi0 + addze $hi0,$ahi + + addc $lo1,$nlo,$hi1 + addze $hi1,$nhi + addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] + addze $hi1,$hi1 + $ST $lo1,0($tp) ; tp[j-1] + + li $ovf,0 + addc $hi1,$hi1,$hi0 + addze $ovf,$ovf ; upmost overflow bit + $ST $hi1,$BNSZ($tp) + + li $i,$BNSZ +.align 4 +Louter: + $LDX $m0,$bp,$i ; m0=bp[i] + $LD $aj,0($ap) ; ap[0] + addi $tp,$sp,$FRAME + $LD $tj,$FRAME($sp) ; tp[0] + $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] + $UMULH $hi0,$aj,$m0 + $LD $aj,$BNSZ($ap) ; ap[1] + $LD $nj,0($np) ; np[0] + addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0] + addze $hi0,$hi0 + + $UMULL $m1,$lo0,$n0 ; tp[0]*n0 + + $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] + $UMULH $ahi,$aj,$m0 + + $UMULL $lo1,$nj,$m1 ; np[0]*m1 + $UMULH $hi1,$nj,$m1 + $LD $nj,$BNSZ($np) ; np[1] + addc $lo1,$lo1,$lo0 + addze $hi1,$hi1 + + $UMULL $nlo,$nj,$m1 ; np[1]*m1 + $UMULH $nhi,$nj,$m1 + + mtctr $num + li $j,`2*$BNSZ` +.align 4 +Linner: + $LDX $aj,$ap,$j ; ap[j] + $LD $tj,$BNSZ($tp) ; tp[j] + addc $lo0,$alo,$hi0 + addze $hi0,$ahi + $LDX $nj,$np,$j ; np[j] + addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] + addze $hi0,$hi0 + $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] + $UMULH $ahi,$aj,$m0 + + addc $lo1,$nlo,$hi1 + addze $hi1,$nhi + $UMULL $nlo,$nj,$m1 ; np[j]*m1 + $UMULH $nhi,$nj,$m1 + addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] + addze $hi1,$hi1 + $ST $lo1,0($tp) ; tp[j-1] + + addi $j,$j,$BNSZ ; j++ + addi $tp,$tp,$BNSZ ; tp++ + bdnz- Linner +;Linner + $LD $tj,$BNSZ($tp) ; tp[j] + addc $lo0,$alo,$hi0 + addze $hi0,$ahi + addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] + addze $hi0,$hi0 + + addc $lo1,$nlo,$hi1 + addze $hi1,$nhi + addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] + addze $hi1,$hi1 + $ST $lo1,0($tp) ; tp[j-1] + + addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA] + li $ovf,0 + adde $hi1,$hi1,$hi0 + addze $ovf,$ovf + $ST $hi1,$BNSZ($tp) +; + slwi $tj,$num,`log($BNSZ)/log(2)` + $UCMP $i,$tj + addi $i,$i,$BNSZ + ble- Louter + + addi $num,$num,2 ; restore $num + addi $tp,$sp,$FRAME + mtctr $num + li $j,0 + + subfc. $ovf,$j,$ovf ; sets XER[CA] + bne Lsub + $UCMP $hi1,$nj + bge Lsub +.align 4 +Lcopy: + $LDX $tj,$tp,$j + $STX $tj,$rp,$j + $STX $j,$tp,$j ; zap at once + addi $j,$j,$BNSZ + bdnz- Lcopy + +Lexit: + $POP r14,`4*$SIZE_T`($sp) + $POP r15,`5*$SIZE_T`($sp) + $POP r16,`6*$SIZE_T`($sp) + $POP r17,`7*$SIZE_T`($sp) + $POP r18,`8*$SIZE_T`($sp) + $POP r19,`9*$SIZE_T`($sp) + $POP r20,`10*$SIZE_T`($sp) + $POP r21,`11*$SIZE_T`($sp) + $POP r22,`12*$SIZE_T`($sp) + $POP r23,`13*$SIZE_T`($sp) + $POP r24,`14*$SIZE_T`($sp) + $POP r25,`15*$SIZE_T`($sp) + $POP $sp,0($sp) + li r3,1 + blr + .long 0 +.align 4 +Lsub: $LDX $tj,$tp,$j + $LDX $nj,$np,$j + subfe $tj,$nj,$tj ; tp[j]-np[j] + $STX $tj,$rp,$j + addi $j,$j,$BNSZ + bdnz- Lsub + li $j,0 + subfe. $ovf,$j,$ovf + mtctr $num + bne Lcopy +.align 4 +Lzap: $STX $j,$tp,$j + addi $j,$j,$BNSZ + bdnz- Lzap + b Lexit +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/crypto/perlasm/ppc-xlate.pl b/crypto/perlasm/ppc-xlate.pl new file mode 100755 index 0000000000..31876e56b2 --- /dev/null +++ b/crypto/perlasm/ppc-xlate.pl @@ -0,0 +1,113 @@ +#!/usr/bin/env perl + +# PowerPC assembler distiller by . + +my $output = shift; +open STDOUT,">$output" || die "can't open $output: $!"; + +my $flavour = $output; +my %GLOBALS; +my $dotinlocallabels=0; + +################################################################ +# directives which need special treatment on different platforms +################################################################ +my $globl = sub { + my $junk = shift; + my $name = shift; + my $global = \$GLOBALS{$name}; + my $ret; + + $name =~ s|^[\.\_]||; + + SWITCH: for ($flavour) { + /aix/ && do { $name = ".$name"; + last; + }; + /osx/ && do { $name = "_$name"; + last; + }; + /linux.*32/ && do { $ret .= ".globl $name\n"; + $ret .= ".type $name,\@function"; + $dotinlocallabels = 1; + last; + }; + /linux.*64/ && do { $ret .= ".globl .$name\n"; + $ret .= ".type .$name,\@function\n"; + $ret .= ".section \".opd\",\"aw\"\n"; + $ret .= ".globl $name\n"; + $ret .= ".align 3\n"; + $ret .= "$name:\n"; + $ret .= ".quad .$name,.TOC.\@tocbase,0\n"; + $ret .= ".size $name,24\n"; + $ret .= ".previous\n"; + + $name = ".$name"; + $dotinlocallabels = 1; + last; + }; + } + + $ret = ".globl $name" if (!$ret); + $$global = $name; + $ret; +}; +my $machine = sub { + my $junk = shift; + my $arch = shift; + $arch = "ppc970" if ($arch eq "any" and $flavour =~ /osx/); + ".machine $arch"; +}; + +################################################################ +# simplified mnemonics not handled by at least one assembler +################################################################ +my $cmplw = sub { + my $f = shift; + my $cr = 0; $cr = shift if ($#_>1); + " cmpl$f ".join(',',$cr,0,@_); +}; +my $cmpld = sub { + my $f = shift; + my $cr = 0; $cr = shift if ($#_>1); + " cmpl$f ".join(',',$cr,1,@_); +}; +my $bdnz = sub { + my $f = shift; + my $bo = $f=~/[\+\-]/ ? 17 : 16; + " bc $bo,0,".shift; +}; + +while($line=<>) { + + $line =~ s|[#!;].*$||; # get rid of asm-style comments... + $line =~ s|/\*.*\*/||; # ... and C-style comments... + $line =~ s|^\s+||; # ... and skip white spaces in beginning... + $line =~ s|\s+$||; # ... and at the end + + { + $line =~ s|\b\.L(\w+)|L$1|g; # common denominator for Locallabel + $line =~ s|\bL(\w+)|\.L$1|g if ($dotinlocallabels); + } + + { + $line =~ s|(^[\.\w]+)\:\s*||; + my $label = $1; + printf "%s:",($GLOBALS{$label} or $label) if ($label); + } + + { + $line =~ s|^\s*(\.?)(\w+)([\.\+\-]?)\s*||; + my $c = $1; $c = "\t" if ($c eq ""); + my $mnemonic = $2; + my $f = $3; + my $opcode = eval("\$$mnemonic"); + if (ref($opcode) eq 'CODE') { $line = &$opcode($f,split(',',$line)); } + elsif ($mnemonic) { $line = $c.$mnemonic.$f."\t".$line; } + } + + print $line if ($line); + print "\n"; +} + +close STDOUT; -- 2.34.1