Add ghash-parisc.pl.
author Andy Polyakov <appro@openssl.org>
Wed, 28 Apr 2010 18:51:45 +0000 (18:51 +0000)
committer Andy Polyakov <appro@openssl.org>
Wed, 28 Apr 2010 18:51:45 +0000 (18:51 +0000)
Configure
TABLE
crypto/modes/Makefile
crypto/modes/asm/ghash-parisc.pl [new file with mode: 0644]

diff --git a/Configure b/Configure
index 3b60587..d9abecd 100755 (executable)
--- a/Configure
+++ b/Configure
@@ -135,8 +135,8 @@ my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-a
 my $mips3_asm=":bn-mips3.o:::::::::::::void";
 my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o::::::void";
 my $armv4_asm=":bn_asm.o armv4-mont.o::aes_cbc.o aes_ctr.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o::::::::void";
-my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes_ctr.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o::::::32";
-my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes_ctr.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o::::::64";
+my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes_ctr.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:32";
+my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes_ctr.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:64";
 my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes_ctr.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o:::::::";
 my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes_ctr.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o:::::::";
 my $no_asm="::::::::::::::void";
@@ -292,6 +292,7 @@ my %table=(
 # Since there is mention of this in shlib/hpux10-cc.sh
 "hpux-parisc-cc-o4","cc:-Ae +O4 +ESlit -z -DB_ENDIAN -DBN_DIV2W -DMD32_XARRAY::-D_REENTRANT::-ldld:BN_LLONG DES_PTR DES_UNROLL DES_RISC1:${no_asm}:dl:hpux-shared:+Z:-b:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "hpux-parisc-gcc","gcc:-O3 -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-Wl,+s -ldld:BN_LLONG DES_PTR DES_UNROLL DES_RISC1:${no_asm}:dl:hpux-shared:-fPIC:-shared:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"hpux-parisc1_1-gcc","gcc:-O3 -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-Wl,+s -ldld:BN_LLONG DES_PTR DES_UNROLL DES_RISC1:${parisc11_asm}:dl:hpux-shared:-fPIC:-shared:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "hpux-parisc2-gcc","gcc:-march=2.0 -O3 -DB_ENDIAN -D_REENTRANT::::-Wl,+s -ldld:SIXTY_FOUR_BIT RC4_CHAR RC4_CHUNK DES_PTR DES_UNROLL DES_RISC1::pa-risc2.o:::::::::::::void:dl:hpux-shared:-fPIC:-shared:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "hpux64-parisc2-gcc","gcc:-O3 -DB_ENDIAN -D_REENTRANT::::-ldl:SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT::pa-risc2W.o:::::::::::::void:dlfcn:hpux-shared:-fpic:-shared:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/pa20_64",
 
diff --git a/TABLE b/TABLE
index 327391f..f7763f0 100644 (file)
--- a/TABLE
+++ b/TABLE
@@ -2933,7 +2933,7 @@ $rmd160_obj   =
 $rc5_obj      = 
 $wp_obj       = 
 $cmll_obj     = 
-$modes_obj    = 
+$modes_obj    = ghash-parisc.o
 $perlasm_scheme = 32
 $dso_scheme   = dl
 $shared_target= hpux-shared
@@ -2944,6 +2944,38 @@ $ranlib       =
 $arflags      = 
 $multilib     = /pa1.1
 
+*** hpux-parisc1_1-gcc
+$cc           = gcc
+$cflags       = -O3 -DB_ENDIAN -DBN_DIV2W
+$unistd       = 
+$thread_cflag = -D_REENTRANT
+$sys_id       = 
+$lflags       = -Wl,+s -ldld
+$bn_ops       = BN_LLONG DES_PTR DES_UNROLL DES_RISC1
+$cpuid_obj    = pariscid.o
+$bn_obj       = bn_asm.o parisc-mont.o
+$des_obj      = 
+$aes_obj      = aes_core.o aes_cbc.o aes_ctr.o aes-parisc.o
+$bf_obj       = 
+$md5_obj      = 
+$sha1_obj     = sha1-parisc.o sha256-parisc.o sha512-parisc.o
+$cast_obj     = 
+$rc4_obj      = rc4-parisc.o
+$rmd160_obj   = 
+$rc5_obj      = 
+$wp_obj       = 
+$cmll_obj     = 
+$modes_obj    = ghash-parisc.o
+$perlasm_scheme = 32
+$dso_scheme   = dl
+$shared_target= hpux-shared
+$shared_cflag = -fPIC
+$shared_ldflag = -shared
+$shared_extension = .sl.$(SHLIB_MAJOR).$(SHLIB_MINOR)
+$ranlib       = 
+$arflags      = 
+$multilib     = 
+
 *** hpux-parisc2-cc
 $cc           = cc
 $cflags       = +DA2.0 +DS2.0 +O3 +Optrs_strongly_typed -Ae +ESlit -DB_ENDIAN -DMD32_XARRAY -D_REENTRANT
@@ -3093,7 +3125,7 @@ $rmd160_obj   =
 $rc5_obj      = 
 $wp_obj       = 
 $cmll_obj     = 
-$modes_obj    = 
+$modes_obj    = ghash-parisc.o
 $perlasm_scheme = 64
 $dso_scheme   = dlfcn
 $shared_target= hpux-shared
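diff --git a/crypto/modes/Makefile b/crypto/modes/Makefile
index ea74c5c..4c0de95 100644 (file)
--- a/crypto/modes/Makefile
+++ b/crypto/modes/Makefile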
@@ -50,9 +50,11 @@ ghash-x86.s: asm/ghash-x86.pl
 ghash-x86_64.s:        asm/ghash-x86_64.pl
        $(PERL) asm/ghash-x86_64.pl $(PERLASM_SCHEME) > $@
 ghash-sparcv9.s:       asm/ghash-sparcv9.pl
-       $(PERL) asm/ghash-sparcv8.pl $(CFLAGS) > $@
+       $(PERL) asm/ghash-sparcv9.pl $(CFLAGS) > $@
 ghash-alpha.s: asm/ghash-alpha.pl
        $(PERL) $< | $(CC) -E - | tee $@ > /dev/null
+ghash-parisc.s:        asm/ghash-parisc.pl     # script opens $@ itself (2nd argument), hence no "> $@"
+       $(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@
 # GNU make "catch all"
 ghash-%.s:     asm/ghash-%.pl; $(PERL) $< $(PERLASM_SCHEME) $(CFLAGS) > $@
 
diff --git a/crypto/modes/asm/ghash-parisc.pl b/crypto/modes/asm/ghash-parisc.pl
new file mode 100644 (file)
index 0000000..8849f01
--- /dev/null
+++ b/crypto/modes/asm/ghash-parisc.pl
@@ -0,0 +1,730 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# April 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
+# it processes one byte in 19 cycles, which is more than twice as fast
+# as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for 8
+# cycles, but measured performance on PA-8600 system is ~9 cycles per
+# processed byte. This is ~2.2x faster than 64-bit code generated by
+# vendor compiler (which used to be very hard to beat:-).
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$flavour = shift;      # perlasm scheme; a value matching /64/ selects the 64-bit code path
+$output = shift;       # path of the assembly file to generate
+open STDOUT,">",(defined $output ? $output : "") or die "can't open output file: $!";  # fail loudly: a failed reopen would leave STDOUT closed and drop all output
+
+if ($flavour =~ /64/) {                # 64-bit flavour
+       $LEVEL          ="2.0W";        # value of the .LEVEL directive
+       $SIZE_T         =8;             # bytes per register save slot
+       $FRAME_MARKER   =80;
+       $SAVED_RP       =16;            # %r2 is stored at -$SAVED_RP(%sp) on entry
+       $PUSH           ="std";
+       $PUSHMA         ="std,ma";
+       $POP            ="ldd";
+       $POPMB          ="ldd,mb";
+       $NREGS          =6;             # interpolated as ENTRY_GR= in gcm_gmult_4bit's .CALLINFO
+} else {                               # every other flavour: 32-bit code
+       $LEVEL          ="1.0"; #"\n\t.ALLOW\t2.0";
+       $SIZE_T         =4;             # bytes per register save slot
+       $FRAME_MARKER   =48;
+       $SAVED_RP       =20;            # %r2 is stored at -$SAVED_RP(%sp) on entry
+       $PUSH           ="stw";
+       $PUSHMA         ="stwm";
+       $POP            ="ldw";
+       $POPMB          ="ldwm";
+       $NREGS          =11;            # interpolated as ENTRY_GR= in gcm_gmult_4bit's .CALLINFO
+}
+
+$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
+                               #                 [+ argument transfer]
+                               #                 [+ argument transfer]
+
+################# volatile registers
+$Xi="%r26";    # argument block
+$Htbl="%r25";
+$inp="%r24";
+$len="%r23";   # becomes the end-of-input pointer after "addl $inp,$len,$len"
+$Hhh=$Htbl;    # variables
+$Hll="%r22";
+$Zhh="%r21";
+$Zll="%r20";
+$cnt="%r19";   # byte index into Xi/inp, counted down from 13 by the inner loops
+$rem_4bit="%r28";      # address of the L$rem_4bit table, derived from the blr return address
+$rem="%r29";
+$mask0xf0="%r31";      # loaded with the constant 0xf0
+
+################# preserved registers
+$Thh="%r1";
+$Tll="%r2";    # %r2 is saved by the prologue and reloaded before "bv (%r2)"
+$nlo="%r3";
+$nhi="%r4";
+$byte="%r5";
+if ($SIZE_T==4) {      # extra 32-bit halves needed only by the 32-bit flavour
+       $Zhl="%r6";     # same register as $rem2 below
+       $Zlh="%r7";
+       $Hhl="%r8";
+       $Hlh="%r9";
+       $Thl="%r10";
+       $Tlh="%r11";
+}
+$rem2="%r6";   # used in PA-RISC 2.0 code
+
+$code.=<<___;
+       .LEVEL  $LEVEL
+       .SPACE  \$TEXT\$
+       .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+       .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
+       .ALIGN  64
+gcm_gmult_4bit
+       .PROC
+       .CALLINFO       FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
+       .ENTRY
+       $PUSH   %r2,-$SAVED_RP(%sp)     ; standard prologue
+       $PUSHMA %r3,$FRAME(%sp)
+       $PUSH   %r4,`-$FRAME+1*$SIZE_T`(%sp)
+       $PUSH   %r5,`-$FRAME+2*$SIZE_T`(%sp)
+       $PUSH   %r6,`-$FRAME+3*$SIZE_T`(%sp)
+___
+$code.=<<___ if ($SIZE_T==4);
+       $PUSH   %r7,`-$FRAME+4*$SIZE_T`(%sp)
+       $PUSH   %r8,`-$FRAME+5*$SIZE_T`(%sp)
+       $PUSH   %r9,`-$FRAME+6*$SIZE_T`(%sp)
+       $PUSH   %r10,`-$FRAME+7*$SIZE_T`(%sp)
+       $PUSH   %r11,`-$FRAME+8*$SIZE_T`(%sp)
+___
+$code.=<<___;
+       blr     %r0,$rem_4bit
+       ldi     3,$rem
+L\$pic_gmult
+       andcm   $rem_4bit,$rem,$rem_4bit
+       addl    $inp,$len,$len
+       ldo     L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
+       ldi     0xf0,$mask0xf0
+___
+$code.=<<___ if ($SIZE_T==4);
+       ldi     31,$rem
+       mtctl   $rem,%cr11
+       extrd,u,*= $rem,%sar,1,$rem     ; executes on PA-RISC 1.0
+       b       L\$parisc1_gmult
+       nop
+___
+\f
+$code.=<<___;
+       ldb     15($Xi),$nlo
+       ldo     8($Htbl),$Hll
+
+       and     $mask0xf0,$nlo,$nhi
+       depd,z  $nlo,59,4,$nlo
+
+       ldd     $nlo($Hll),$Zll
+       ldd     $nlo($Hhh),$Zhh
+
+       depd,z  $Zll,60,4,$rem
+       shrpd   $Zhh,$Zll,4,$Zll
+       extrd,u $Zhh,59,60,$Zhh
+       ldb     14($Xi),$nlo
+
+       ldd     $nhi($Hll),$Tll
+       ldd     $nhi($Hhh),$Thh
+       and     $mask0xf0,$nlo,$nhi
+       depd,z  $nlo,59,4,$nlo
+
+       xor     $Tll,$Zll,$Zll
+       xor     $Thh,$Zhh,$Zhh
+       ldd     $rem($rem_4bit),$rem
+       b       L\$oop_gmult_pa2
+       ldi     13,$cnt
+
+       .ALIGN  8
+L\$oop_gmult_pa2
+       xor     $rem,$Zhh,$Zhh          ; moved here to work around gas bug
+       depd,z  $Zll,60,4,$rem
+
+       shrpd   $Zhh,$Zll,4,$Zll
+       extrd,u $Zhh,59,60,$Zhh
+       ldd     $nlo($Hll),$Tll
+       ldd     $nlo($Hhh),$Thh
+
+       xor     $Tll,$Zll,$Zll
+       xor     $Thh,$Zhh,$Zhh
+       ldd     $rem($rem_4bit),$rem
+
+       xor     $rem,$Zhh,$Zhh
+       depd,z  $Zll,60,4,$rem
+       ldbx    $cnt($Xi),$nlo
+
+       shrpd   $Zhh,$Zll,4,$Zll
+       extrd,u $Zhh,59,60,$Zhh
+       ldd     $nhi($Hll),$Tll
+       ldd     $nhi($Hhh),$Thh
+
+       and     $mask0xf0,$nlo,$nhi
+       depd,z  $nlo,59,4,$nlo
+       ldd     $rem($rem_4bit),$rem
+
+       xor     $Tll,$Zll,$Zll
+       addib,uv -1,$cnt,L\$oop_gmult_pa2
+       xor     $Thh,$Zhh,$Zhh
+
+       xor     $rem,$Zhh,$Zhh
+       depd,z  $Zll,60,4,$rem
+
+       shrpd   $Zhh,$Zll,4,$Zll
+       extrd,u $Zhh,59,60,$Zhh
+       ldd     $nlo($Hll),$Tll
+       ldd     $nlo($Hhh),$Thh
+
+       xor     $Tll,$Zll,$Zll
+       xor     $Thh,$Zhh,$Zhh
+       ldd     $rem($rem_4bit),$rem
+
+       xor     $rem,$Zhh,$Zhh
+       depd,z  $Zll,60,4,$rem
+
+       shrpd   $Zhh,$Zll,4,$Zll
+       extrd,u $Zhh,59,60,$Zhh
+       ldd     $nhi($Hll),$Tll
+       ldd     $nhi($Hhh),$Thh
+
+       xor     $Tll,$Zll,$Zll
+       xor     $Thh,$Zhh,$Zhh
+       ldd     $rem($rem_4bit),$rem
+
+       xor     $rem,$Zhh,$Zhh
+       std     $Zll,8($Xi)
+       std     $Zhh,0($Xi)
+___
+\f
+$code.=<<___ if ($SIZE_T==4);
+       b       L\$done_gmult
+       nop
+
+L\$parisc1_gmult
+       ldb     15($Xi),$nlo
+       ldo     12($Htbl),$Hll
+       ldo     8($Htbl),$Hlh
+       ldo     4($Htbl),$Hhl
+
+       and     $mask0xf0,$nlo,$nhi
+       zdep    $nlo,27,4,$nlo
+
+       ldwx    $nlo($Hll),$Zll
+       ldwx    $nlo($Hlh),$Zlh
+       ldwx    $nlo($Hhl),$Zhl
+       ldwx    $nlo($Hhh),$Zhh
+       zdep    $Zll,28,4,$rem
+       ldb     14($Xi),$nlo
+       ldwx    $rem($rem_4bit),$rem
+       shrpw   $Zlh,$Zll,4,$Zll
+       ldwx    $nhi($Hll),$Tll
+       shrpw   $Zhl,$Zlh,4,$Zlh
+       ldwx    $nhi($Hlh),$Tlh
+       shrpw   $Zhh,$Zhl,4,$Zhl
+       ldwx    $nhi($Hhl),$Thl
+       extru   $Zhh,27,28,$Zhh
+       ldwx    $nhi($Hhh),$Thh
+       xor     $rem,$Zhh,$Zhh
+       and     $mask0xf0,$nlo,$nhi
+       zdep    $nlo,27,4,$nlo
+
+       xor     $Tll,$Zll,$Zll
+       ldwx    $nlo($Hll),$Tll
+       xor     $Tlh,$Zlh,$Zlh
+       ldwx    $nlo($Hlh),$Tlh
+       xor     $Thl,$Zhl,$Zhl
+       b       L\$oop_gmult_pa1
+       ldi     13,$cnt
+
+       .ALIGN  8
+L\$oop_gmult_pa1
+       zdep    $Zll,28,4,$rem
+       ldwx    $nlo($Hhl),$Thl
+       xor     $Thh,$Zhh,$Zhh
+       ldwx    $rem($rem_4bit),$rem
+       shrpw   $Zlh,$Zll,4,$Zll
+       ldwx    $nlo($Hhh),$Thh
+       shrpw   $Zhl,$Zlh,4,$Zlh
+       ldbx    $cnt($Xi),$nlo
+       xor     $Tll,$Zll,$Zll
+       ldwx    $nhi($Hll),$Tll
+       shrpw   $Zhh,$Zhl,4,$Zhl
+       xor     $Tlh,$Zlh,$Zlh
+       ldwx    $nhi($Hlh),$Tlh
+       extru   $Zhh,27,28,$Zhh
+       xor     $Thl,$Zhl,$Zhl
+       ldwx    $nhi($Hhl),$Thl
+       xor     $rem,$Zhh,$Zhh
+       zdep    $Zll,28,4,$rem
+       xor     $Thh,$Zhh,$Zhh
+       ldwx    $nhi($Hhh),$Thh
+       shrpw   $Zlh,$Zll,4,$Zll
+       ldwx    $rem($rem_4bit),$rem
+       shrpw   $Zhl,$Zlh,4,$Zlh
+       shrpw   $Zhh,$Zhl,4,$Zhl
+       and     $mask0xf0,$nlo,$nhi
+       extru   $Zhh,27,28,$Zhh
+       zdep    $nlo,27,4,$nlo
+       xor     $Tll,$Zll,$Zll
+       ldwx    $nlo($Hll),$Tll
+       xor     $Tlh,$Zlh,$Zlh
+       ldwx    $nlo($Hlh),$Tlh
+       xor     $rem,$Zhh,$Zhh
+       addib,uv -1,$cnt,L\$oop_gmult_pa1
+       xor     $Thl,$Zhl,$Zhl
+
+       zdep    $Zll,28,4,$rem
+       ldwx    $nlo($Hhl),$Thl
+       xor     $Thh,$Zhh,$Zhh
+       ldwx    $rem($rem_4bit),$rem
+       shrpw   $Zlh,$Zll,4,$Zll
+       ldwx    $nlo($Hhh),$Thh
+       shrpw   $Zhl,$Zlh,4,$Zlh
+       xor     $Tll,$Zll,$Zll
+       ldwx    $nhi($Hll),$Tll
+       shrpw   $Zhh,$Zhl,4,$Zhl
+       xor     $Tlh,$Zlh,$Zlh
+       ldwx    $nhi($Hlh),$Tlh
+       extru   $Zhh,27,28,$Zhh
+       xor     $rem,$Zhh,$Zhh
+       xor     $Thl,$Zhl,$Zhl
+       ldwx    $nhi($Hhl),$Thl
+       xor     $Thh,$Zhh,$Zhh
+       ldwx    $nhi($Hhh),$Thh
+       zdep    $Zll,28,4,$rem
+       ldwx    $rem($rem_4bit),$rem
+       shrpw   $Zlh,$Zll,4,$Zll
+       shrpw   $Zhl,$Zlh,4,$Zlh
+       shrpw   $Zhh,$Zhl,4,$Zhl
+       extru   $Zhh,27,28,$Zhh
+       xor     $Tll,$Zll,$Zll
+       xor     $Tlh,$Zlh,$Zlh
+       xor     $rem,$Zhh,$Zhh
+       stw     $Zll,12($Xi)
+       xor     $Thl,$Zhl,$Zhl
+       stw     $Zlh,8($Xi)
+       xor     $Thh,$Zhh,$Zhh
+       stw     $Zhl,4($Xi)
+       stw     $Zhh,0($Xi)
+___
+$code.=<<___;
+L\$done_gmult
+       $POP    `-$FRAME-$SAVED_RP`(%sp),%r2            ; standard epilogue
+       $POP    `-$FRAME+1*$SIZE_T`(%sp),%r4
+       $POP    `-$FRAME+2*$SIZE_T`(%sp),%r5
+       $POP    `-$FRAME+3*$SIZE_T`(%sp),%r6
+___
+$code.=<<___ if ($SIZE_T==4);
+       $POP    `-$FRAME+4*$SIZE_T`(%sp),%r7
+       $POP    `-$FRAME+5*$SIZE_T`(%sp),%r8
+       $POP    `-$FRAME+6*$SIZE_T`(%sp),%r9
+       $POP    `-$FRAME+7*$SIZE_T`(%sp),%r10
+       $POP    `-$FRAME+8*$SIZE_T`(%sp),%r11
+___
+$code.=<<___;
+       bv      (%r2)
+       .EXIT
+       $POPMB  -$FRAME(%sp),%r3
+       .PROCEND
+
+       .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+       .ALIGN  64
+gcm_ghash_4bit
+       .PROC
+       .CALLINFO       FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
+       .ENTRY
+       $PUSH   %r2,-$SAVED_RP(%sp)     ; standard prologue
+       $PUSHMA %r3,$FRAME(%sp)
+       $PUSH   %r4,`-$FRAME+1*$SIZE_T`(%sp)
+       $PUSH   %r5,`-$FRAME+2*$SIZE_T`(%sp)
+       $PUSH   %r6,`-$FRAME+3*$SIZE_T`(%sp)
+___
+$code.=<<___ if ($SIZE_T==4);
+       $PUSH   %r7,`-$FRAME+4*$SIZE_T`(%sp)
+       $PUSH   %r8,`-$FRAME+5*$SIZE_T`(%sp)
+       $PUSH   %r9,`-$FRAME+6*$SIZE_T`(%sp)
+       $PUSH   %r10,`-$FRAME+7*$SIZE_T`(%sp)
+       $PUSH   %r11,`-$FRAME+8*$SIZE_T`(%sp)
+___
+$code.=<<___;
+       blr     %r0,$rem_4bit
+       ldi     3,$rem
+L\$pic_ghash
+       andcm   $rem_4bit,$rem,$rem_4bit
+       addl    $inp,$len,$len
+       ldo     L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
+       ldi     0xf0,$mask0xf0
+___
+$code.=<<___ if ($SIZE_T==4);
+       ldi     31,$rem
+       mtctl   $rem,%cr11
+       extrd,u,*= $rem,%sar,1,$rem     ; executes on PA-RISC 1.0
+       b       L\$parisc1_ghash
+       nop
+___
+\f\f
+$code.=<<___;
+       ldb     15($Xi),$nlo
+       ldo     8($Htbl),$Hll
+
+L\$outer_ghash_pa2
+       ldb     15($inp),$nhi
+       xor     $nhi,$nlo,$nlo
+       and     $mask0xf0,$nlo,$nhi
+       depd,z  $nlo,59,4,$nlo
+
+       ldd     $nlo($Hll),$Zll
+       ldd     $nlo($Hhh),$Zhh
+
+       depd,z  $Zll,60,4,$rem
+       shrpd   $Zhh,$Zll,4,$Zll
+       extrd,u $Zhh,59,60,$Zhh
+       ldb     14($Xi),$nlo
+       ldb     14($inp),$byte
+
+       ldd     $nhi($Hll),$Tll
+       ldd     $nhi($Hhh),$Thh
+       xor     $byte,$nlo,$nlo
+       and     $mask0xf0,$nlo,$nhi
+       depd,z  $nlo,59,4,$nlo
+
+       xor     $Tll,$Zll,$Zll
+       xor     $Thh,$Zhh,$Zhh
+       ldd     $rem($rem_4bit),$rem
+       b       L\$oop_ghash_pa2
+       ldi     13,$cnt
+
+       .ALIGN  8
+L\$oop_ghash_pa2
+       xor     $rem,$Zhh,$Zhh          ; moved here to work around gas bug
+       depd,z  $Zll,60,4,$rem2
+
+       shrpd   $Zhh,$Zll,4,$Zll
+       extrd,u $Zhh,59,60,$Zhh
+       ldd     $nlo($Hll),$Tll
+       ldd     $nlo($Hhh),$Thh
+
+       xor     $Tll,$Zll,$Zll
+       xor     $Thh,$Zhh,$Zhh
+       ldbx    $cnt($Xi),$nlo
+       ldbx    $cnt($inp),$byte
+
+       depd,z  $Zll,60,4,$rem
+       shrpd   $Zhh,$Zll,4,$Zll
+       ldd     $rem2($rem_4bit),$rem2
+
+       xor     $rem2,$Zhh,$Zhh
+       xor     $byte,$nlo,$nlo
+       ldd     $nhi($Hll),$Tll
+       ldd     $nhi($Hhh),$Thh
+
+       and     $mask0xf0,$nlo,$nhi
+       depd,z  $nlo,59,4,$nlo
+
+       extrd,u $Zhh,59,60,$Zhh
+       xor     $Tll,$Zll,$Zll
+
+       ldd     $rem($rem_4bit),$rem
+       addib,uv -1,$cnt,L\$oop_ghash_pa2
+       xor     $Thh,$Zhh,$Zhh
+
+       xor     $rem,$Zhh,$Zhh
+       depd,z  $Zll,60,4,$rem2
+
+       shrpd   $Zhh,$Zll,4,$Zll
+       extrd,u $Zhh,59,60,$Zhh
+       ldd     $nlo($Hll),$Tll
+       ldd     $nlo($Hhh),$Thh
+
+       xor     $Tll,$Zll,$Zll
+       xor     $Thh,$Zhh,$Zhh
+
+       depd,z  $Zll,60,4,$rem
+       shrpd   $Zhh,$Zll,4,$Zll
+       ldd     $rem2($rem_4bit),$rem2
+
+       xor     $rem2,$Zhh,$Zhh
+       ldd     $nhi($Hll),$Tll
+       ldd     $nhi($Hhh),$Thh
+
+       extrd,u $Zhh,59,60,$Zhh
+       xor     $Tll,$Zll,$Zll
+       xor     $Thh,$Zhh,$Zhh
+       ldd     $rem($rem_4bit),$rem
+
+       xor     $rem,$Zhh,$Zhh
+       std     $Zll,8($Xi)
+       ldo     16($inp),$inp
+       std     $Zhh,0($Xi)
+       cmpb,*<> $inp,$len,L\$outer_ghash_pa2
+       copy    $Zll,$nlo
+___
+\f
+$code.=<<___ if ($SIZE_T==4);
+       b       L\$done_ghash
+       nop
+
+L\$parisc1_ghash
+       ldb     15($Xi),$nlo
+       ldo     12($Htbl),$Hll
+       ldo     8($Htbl),$Hlh
+       ldo     4($Htbl),$Hhl
+
+L\$outer_ghash_pa1
+       ldb     15($inp),$byte
+       xor     $byte,$nlo,$nlo
+       and     $mask0xf0,$nlo,$nhi
+       zdep    $nlo,27,4,$nlo
+
+       ldwx    $nlo($Hll),$Zll
+       ldwx    $nlo($Hlh),$Zlh
+       ldwx    $nlo($Hhl),$Zhl
+       ldwx    $nlo($Hhh),$Zhh
+       zdep    $Zll,28,4,$rem
+       ldb     14($Xi),$nlo
+       ldb     14($inp),$byte
+       ldwx    $rem($rem_4bit),$rem
+       shrpw   $Zlh,$Zll,4,$Zll
+       ldwx    $nhi($Hll),$Tll
+       shrpw   $Zhl,$Zlh,4,$Zlh
+       ldwx    $nhi($Hlh),$Tlh
+       shrpw   $Zhh,$Zhl,4,$Zhl
+       ldwx    $nhi($Hhl),$Thl
+       extru   $Zhh,27,28,$Zhh
+       ldwx    $nhi($Hhh),$Thh
+       xor     $byte,$nlo,$nlo
+       xor     $rem,$Zhh,$Zhh
+       and     $mask0xf0,$nlo,$nhi
+       zdep    $nlo,27,4,$nlo
+
+       xor     $Tll,$Zll,$Zll
+       ldwx    $nlo($Hll),$Tll
+       xor     $Tlh,$Zlh,$Zlh
+       ldwx    $nlo($Hlh),$Tlh
+       xor     $Thl,$Zhl,$Zhl
+       b       L\$oop_ghash_pa1
+       ldi     13,$cnt
+
+       .ALIGN  8
+L\$oop_ghash_pa1
+       zdep    $Zll,28,4,$rem
+       ldwx    $nlo($Hhl),$Thl
+       xor     $Thh,$Zhh,$Zhh
+       ldwx    $rem($rem_4bit),$rem
+       shrpw   $Zlh,$Zll,4,$Zll
+       ldwx    $nlo($Hhh),$Thh
+       shrpw   $Zhl,$Zlh,4,$Zlh
+       ldbx    $cnt($Xi),$nlo
+       xor     $Tll,$Zll,$Zll
+       ldwx    $nhi($Hll),$Tll
+       shrpw   $Zhh,$Zhl,4,$Zhl
+       ldbx    $cnt($inp),$byte
+       xor     $Tlh,$Zlh,$Zlh
+       ldwx    $nhi($Hlh),$Tlh
+       extru   $Zhh,27,28,$Zhh
+       xor     $Thl,$Zhl,$Zhl
+       ldwx    $nhi($Hhl),$Thl
+       xor     $rem,$Zhh,$Zhh
+       zdep    $Zll,28,4,$rem
+       xor     $Thh,$Zhh,$Zhh
+       ldwx    $nhi($Hhh),$Thh
+       shrpw   $Zlh,$Zll,4,$Zll
+       ldwx    $rem($rem_4bit),$rem
+       shrpw   $Zhl,$Zlh,4,$Zlh
+       xor     $byte,$nlo,$nlo
+       shrpw   $Zhh,$Zhl,4,$Zhl
+       and     $mask0xf0,$nlo,$nhi
+       extru   $Zhh,27,28,$Zhh
+       zdep    $nlo,27,4,$nlo
+       xor     $Tll,$Zll,$Zll
+       ldwx    $nlo($Hll),$Tll
+       xor     $Tlh,$Zlh,$Zlh
+       ldwx    $nlo($Hlh),$Tlh
+       xor     $rem,$Zhh,$Zhh
+       addib,uv -1,$cnt,L\$oop_ghash_pa1
+       xor     $Thl,$Zhl,$Zhl
+
+       zdep    $Zll,28,4,$rem
+       ldwx    $nlo($Hhl),$Thl
+       xor     $Thh,$Zhh,$Zhh
+       ldwx    $rem($rem_4bit),$rem
+       shrpw   $Zlh,$Zll,4,$Zll
+       ldwx    $nlo($Hhh),$Thh
+       shrpw   $Zhl,$Zlh,4,$Zlh
+       xor     $Tll,$Zll,$Zll
+       ldwx    $nhi($Hll),$Tll
+       shrpw   $Zhh,$Zhl,4,$Zhl
+       xor     $Tlh,$Zlh,$Zlh
+       ldwx    $nhi($Hlh),$Tlh
+       extru   $Zhh,27,28,$Zhh
+       xor     $rem,$Zhh,$Zhh
+       xor     $Thl,$Zhl,$Zhl
+       ldwx    $nhi($Hhl),$Thl
+       xor     $Thh,$Zhh,$Zhh
+       ldwx    $nhi($Hhh),$Thh
+       zdep    $Zll,28,4,$rem
+       ldwx    $rem($rem_4bit),$rem
+       shrpw   $Zlh,$Zll,4,$Zll
+       shrpw   $Zhl,$Zlh,4,$Zlh
+       shrpw   $Zhh,$Zhl,4,$Zhl
+       extru   $Zhh,27,28,$Zhh
+       xor     $Tll,$Zll,$Zll
+       xor     $Tlh,$Zlh,$Zlh
+       xor     $rem,$Zhh,$Zhh
+       stw     $Zll,12($Xi)
+       xor     $Thl,$Zhl,$Zhl
+       stw     $Zlh,8($Xi)
+       xor     $Thh,$Zhh,$Zhh
+       stw     $Zhl,4($Xi)
+       ldo     16($inp),$inp
+       stw     $Zhh,0($Xi)
+       comb,<> $inp,$len,L\$outer_ghash_pa1
+       copy    $Zll,$nlo
+___
+$code.=<<___;
+L\$done_ghash
+       $POP    `-$FRAME-$SAVED_RP`(%sp),%r2            ; standard epilogue
+       $POP    `-$FRAME+1*$SIZE_T`(%sp),%r4
+       $POP    `-$FRAME+2*$SIZE_T`(%sp),%r5
+       $POP    `-$FRAME+3*$SIZE_T`(%sp),%r6
+___
+$code.=<<___ if ($SIZE_T==4);
+       $POP    `-$FRAME+4*$SIZE_T`(%sp),%r7
+       $POP    `-$FRAME+5*$SIZE_T`(%sp),%r8
+       $POP    `-$FRAME+6*$SIZE_T`(%sp),%r9
+       $POP    `-$FRAME+7*$SIZE_T`(%sp),%r10
+       $POP    `-$FRAME+8*$SIZE_T`(%sp),%r11
+___
+$code.=<<___;
+       bv      (%r2)
+       .EXIT
+       $POPMB  -$FRAME(%sp),%r3
+       .PROCEND
+
+       .ALIGN  64
+L\$rem_4bit
+       .WORD   `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
+       .WORD   `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
+       .WORD   `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
+       .WORD   `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
+       .STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>"
+       .ALIGN  64
+___
+
+# Explicitly encode PA-RISC 2.0 instructions used in this module, so
+# that it can be compiled with .LEVEL 1.0. It should be noted that I
+# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
+# directive...
+
+my $ldd = sub {                        # encode "ldd" as a raw .WORD when the operand form is recognised
+  my ($mod,$args) = @_;                # completer suffix (e.g. ",mb") and operand string
+  my $orig = "ldd$mod\t$args";         # original text, appended as an assembler comment
+
+    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)                # format 4
+    {  my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
+       sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)    # format 5
+    {  my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
+       $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12);                # encode offset
+       $opcode|=(1<<5)  if ($mod =~ /^,m/);                    # set for both ",ma" and ",mb"
+       $opcode|=(1<<13) if ($mod =~ /^,mb/);                   # additionally set for ",mb" only
+       sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }               # unrecognised operands: pass the instruction through as text
+};
+
+my $std = sub {                        # encode "std" as a raw .WORD when the operand form is recognised
+  my ($mod,$args) = @_;                # completer suffix and operand string; $mod is not encoded here
+  my $orig = "std$mod\t$args";         # original text, appended as an assembler comment
+
+    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
+    {  my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);  # offset bits 3..12 shifted up by one, bit 13 moved to bit 0
+       sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }               # unrecognised operands: pass the instruction through as text
+};
+
+my $extrd = sub {                      # encode "extrd" as a raw .WORD when the operand form is recognised
+  my ($mod,$args) = @_;                # completer suffix (e.g. ",u" or ",u,*=") and operand string
+  my $orig = "extrd$mod\t$args";       # original text, appended as an assembler comment
+
+    # I only have ",u" completer, it's implicitly encoded...
+    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)    # format 15
+    {  my $opcode=(0x36<<26)|($1<<21)|($4<<16);
+       my $len=32-$3;                                          # length is stored as 32 minus the operand
+       $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);               # encode pos
+       $opcode |= (($len&0x20)<<7)|($len&0x1f);                # encode len
+       sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)     # format 12
+    {  my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
+       my $len=32-$2;                                          # length is stored as 32 minus the operand
+       $opcode |= (($len&0x20)<<3)|($len&0x1f);                # encode len
+       $opcode |= (1<<13) if ($mod =~ /,\**=/);                # "=" / "*=" condition completer
+       sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }               # unrecognised operands: pass the instruction through as text
+};
+
+my $shrpd = sub {                      # encode "shrpd" as a raw .WORD when the operand form is recognised
+  my ($mod,$args) = @_;                # completer suffix and operand string; $mod is not encoded here
+  my $orig = "shrpd$mod\t$args";       # original text, appended as an assembler comment
+
+    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)  # format 14
+    {  my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
+       my $cpos=63-$3;                                         # shift amount is stored as 63 minus the operand
+       $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);         # encode sa
+       sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/)   # format 11
+    {  sprintf "\t.WORD\t0x%08x\t; %s",
+               (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
+    }
+    else { "\t".$orig; }               # unrecognised operands: pass the instruction through as text
+};
+
+my $depd = sub {                       # encode "depd" as a raw .WORD when the operand form is recognised
+  my ($mod,$args) = @_;                # completer suffix and operand string; $mod is not encoded here
+  my $orig = "depd$mod\t$args";        # original text, appended as an assembler comment
+
+    # I only have ",z" completer, it's implicitly encoded...
+    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)    # format 16
+    {  my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
+       my $cpos=63-$2;                                         # position is stored as 63 minus the operand
+       my $len=32-$3;                                          # length is stored as 32 minus the operand
+       $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);         # encode pos
+       $opcode |= (($len&0x20)<<7)|($len&0x1f);                # encode len
+       sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }               # unrecognised operands: pass the instruction through as text
+};
+
+sub assemble {                         # return the line for one instruction, hand-encoded if an encoder exists
+  my ($mnemonic,$mod,$args)=@_;        # mnemonic, completer suffix, operand string
+  my $opcode = eval("\$$mnemonic");    # fetch the variable named after the mnemonic ($ldd, $std, ...)
+
+    ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";  # no encoder: re-emit the text
+}
+
+foreach (split("\n",$code)) {          # post-process and print the template one line at a time
+       s/\`([^\`]*)\`/eval $1/ge;      # replace each `...` with the value of the enclosed Perl expression
+       if ($SIZE_T==4) {               # 32-bit flavour only
+               s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;  # substitute .WORD encodings where available
+               s/cmpb,\*/comb,/;       # "cmpb,*<cond>" becomes "comb,<cond>"
+               s/,\*/,/;               # strip the first "*" that follows a comma
+       }
+       print $_,"\n";
+}
+
+close STDOUT or die "error closing STDOUT: $!";        # buffered write errors only surface at close