From 1a111921daec7e3447d0706dafa0e9686bdf0097 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Mon, 14 Nov 2011 20:50:15 +0000 Subject: [PATCH] PA-RISC assembler pack update from HEAD. --- crypto/aes/asm/aes-parisc.pl | 1021 +++++++++++++++++++++++++++++++ crypto/bn/asm/parisc-mont.pl | 993 ++++++++++++++++++++++++++++++ crypto/pariscid.pl | 224 +++++++ crypto/rc4/asm/rc4-parisc.pl | 313 ++++++++++ crypto/sha/asm/sha1-parisc.pl | 259 ++++++++ crypto/sha/asm/sha512-parisc.pl | 791 ++++++++++++++++++++++++ 6 files changed, 3601 insertions(+) create mode 100644 crypto/aes/asm/aes-parisc.pl create mode 100644 crypto/bn/asm/parisc-mont.pl create mode 100644 crypto/pariscid.pl create mode 100644 crypto/rc4/asm/rc4-parisc.pl create mode 100644 crypto/sha/asm/sha1-parisc.pl create mode 100755 crypto/sha/asm/sha512-parisc.pl diff --git a/crypto/aes/asm/aes-parisc.pl b/crypto/aes/asm/aes-parisc.pl new file mode 100644 index 0000000000..c36b6a2270 --- /dev/null +++ b/crypto/aes/asm/aes-parisc.pl @@ -0,0 +1,1021 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# AES for PA-RISC. +# +# June 2009. +# +# The module is mechanical transliteration of aes-sparcv9.pl, but with +# a twist: S-boxes are compressed even further down to 1K+256B. On +# PA-7100LC performance is ~40% better than gcc 3.2 generated code and +# is about 33 cycles per byte processed with 128-bit key. Newer CPUs +# perform at 16 cycles per byte. It's not faster than code generated +# by vendor compiler, but recall that it has compressed S-boxes, which +# requires extra processing. +# +# Special thanks to polarhome.com for providing HP-UX account. + +$flavour = shift; +$output = shift; +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $FRAME_MARKER =80; + $SAVED_RP =16; + $PUSH ="std"; + $PUSHMA ="std,ma"; + $POP ="ldd"; + $POPMB ="ldd,mb"; +} else { + $LEVEL ="1.0"; + $SIZE_T =4; + $FRAME_MARKER =48; + $SAVED_RP =20; + $PUSH ="stw"; + $PUSHMA ="stwm"; + $POP ="ldw"; + $POPMB ="ldwm"; +} + +$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker + # [+ argument transfer] +$inp="%r26"; # arg0 +$out="%r25"; # arg1 +$key="%r24"; # arg2 + +($s0,$s1,$s2,$s3) = ("%r1","%r2","%r3","%r4"); +($t0,$t1,$t2,$t3) = ("%r5","%r6","%r7","%r8"); + +($acc0, $acc1, $acc2, $acc3, $acc4, $acc5, $acc6, $acc7, + $acc8, $acc9,$acc10,$acc11,$acc12,$acc13,$acc14,$acc15) = +("%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16", +"%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r26"); + +$tbl="%r28"; +$rounds="%r29"; + +$code=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .EXPORT AES_encrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR + .ALIGN 64 +AES_encrypt + .PROC + .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) + $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) + $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) + $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) + $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) + $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) + $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp) + $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp) + + blr %r0,$tbl + ldi 3,$t0 +L\$enc_pic + andcm $tbl,$t0,$tbl + ldo L\$AES_Te-L\$enc_pic($tbl),$tbl + + and $inp,$t0,$t0 + sub $inp,$t0,$inp + ldw 0($inp),$s0 + ldw 4($inp),$s1 + ldw 8($inp),$s2 + comib,= 0,$t0,L\$enc_inp_aligned + ldw 12($inp),$s3 + + sh3addl $t0,%r0,$t0 + subi 32,$t0,$t0 + mtctl $t0,%cr11 + ldw 16($inp),$t1 + vshd $s0,$s1,$s0 + vshd $s1,$s2,$s1 + vshd $s2,$s3,$s2 + vshd $s3,$t1,$s3 + +L\$enc_inp_aligned + bl _parisc_AES_encrypt,%r31 + nop + + extru,<> $out,31,2,%r0 + b L\$enc_out_aligned + nop + + _srm $s0,24,$acc0 + _srm $s0,16,$acc1 + stb $acc0,0($out) + _srm $s0,8,$acc2 + stb $acc1,1($out) + _srm $s1,24,$acc4 + stb $acc2,2($out) + _srm $s1,16,$acc5 + stb $s0,3($out) + _srm $s1,8,$acc6 + stb $acc4,4($out) + _srm $s2,24,$acc0 + stb $acc5,5($out) + _srm $s2,16,$acc1 + stb $acc6,6($out) + _srm $s2,8,$acc2 + stb $s1,7($out) + _srm $s3,24,$acc4 + stb $acc0,8($out) + _srm $s3,16,$acc5 + stb $acc1,9($out) + _srm $s3,8,$acc6 + stb $acc2,10($out) + stb $s2,11($out) + stb $acc4,12($out) + stb $acc5,13($out) + stb $acc6,14($out) + b L\$enc_done + stb $s3,15($out) + +L\$enc_out_aligned + stw $s0,0($out) + stw $s1,4($out) + stw $s2,8($out) + stw $s3,12($out) + +L\$enc_done + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 + $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 + $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 + $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 + $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 + $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 + $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 + $POP `-$FRAME+14*$SIZE_T`(%sp),%r17 + $POP `-$FRAME+15*$SIZE_T`(%sp),%r18 + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + + .ALIGN 16 +_parisc_AES_encrypt + .PROC + .CALLINFO MILLICODE + .ENTRY + ldw 240($key),$rounds + ldw 0($key),$t0 + ldw 4($key),$t1 + ldw 8($key),$t2 + _srm $rounds,1,$rounds + xor $t0,$s0,$s0 + ldw 12($key),$t3 + _srm $s0,24,$acc0 + xor $t1,$s1,$s1 + ldw 16($key),$t0 + _srm $s1,16,$acc1 + xor $t2,$s2,$s2 + ldw 20($key),$t1 + xor $t3,$s3,$s3 + ldw 24($key),$t2 + ldw 28($key),$t3 +L\$enc_loop + _srm $s2,8,$acc2 + ldwx,s $acc0($tbl),$acc0 + _srm $s3,0,$acc3 + ldwx,s $acc1($tbl),$acc1 + _srm $s1,24,$acc4 + ldwx,s $acc2($tbl),$acc2 + _srm $s2,16,$acc5 + ldwx,s $acc3($tbl),$acc3 + _srm $s3,8,$acc6 + ldwx,s $acc4($tbl),$acc4 + _srm $s0,0,$acc7 + ldwx,s $acc5($tbl),$acc5 + _srm $s2,24,$acc8 + ldwx,s $acc6($tbl),$acc6 + _srm $s3,16,$acc9 + ldwx,s $acc7($tbl),$acc7 + _srm $s0,8,$acc10 + ldwx,s $acc8($tbl),$acc8 + _srm $s1,0,$acc11 + ldwx,s $acc9($tbl),$acc9 + _srm $s3,24,$acc12 + ldwx,s $acc10($tbl),$acc10 + _srm $s0,16,$acc13 + ldwx,s $acc11($tbl),$acc11 + _srm $s1,8,$acc14 + ldwx,s $acc12($tbl),$acc12 + _srm $s2,0,$acc15 + ldwx,s $acc13($tbl),$acc13 + ldwx,s $acc14($tbl),$acc14 + ldwx,s $acc15($tbl),$acc15 + addib,= -1,$rounds,L\$enc_last + ldo 32($key),$key + + _ror $acc1,8,$acc1 + xor $acc0,$t0,$t0 + ldw 0($key),$s0 + _ror $acc2,16,$acc2 + xor $acc1,$t0,$t0 + ldw 4($key),$s1 + _ror $acc3,24,$acc3 + xor $acc2,$t0,$t0 + ldw 8($key),$s2 + _ror $acc5,8,$acc5 + xor $acc3,$t0,$t0 + ldw 12($key),$s3 + _ror $acc6,16,$acc6 + xor $acc4,$t1,$t1 + _ror $acc7,24,$acc7 + xor $acc5,$t1,$t1 + _ror $acc9,8,$acc9 + xor $acc6,$t1,$t1 + _ror $acc10,16,$acc10 + xor $acc7,$t1,$t1 + _ror $acc11,24,$acc11 + xor $acc8,$t2,$t2 + _ror $acc13,8,$acc13 + xor $acc9,$t2,$t2 + _ror $acc14,16,$acc14 + xor $acc10,$t2,$t2 + _ror $acc15,24,$acc15 + xor $acc11,$t2,$t2 + xor $acc12,$acc14,$acc14 + xor $acc13,$t3,$t3 + _srm $t0,24,$acc0 + xor $acc14,$t3,$t3 + _srm $t1,16,$acc1 + xor $acc15,$t3,$t3 + + _srm $t2,8,$acc2 + ldwx,s $acc0($tbl),$acc0 + _srm $t3,0,$acc3 + ldwx,s $acc1($tbl),$acc1 + _srm $t1,24,$acc4 + ldwx,s $acc2($tbl),$acc2 + _srm $t2,16,$acc5 + ldwx,s $acc3($tbl),$acc3 + _srm $t3,8,$acc6 + ldwx,s $acc4($tbl),$acc4 + _srm $t0,0,$acc7 + ldwx,s $acc5($tbl),$acc5 + _srm $t2,24,$acc8 + ldwx,s $acc6($tbl),$acc6 + _srm $t3,16,$acc9 + ldwx,s $acc7($tbl),$acc7 + _srm $t0,8,$acc10 + ldwx,s $acc8($tbl),$acc8 + _srm $t1,0,$acc11 + ldwx,s $acc9($tbl),$acc9 + _srm $t3,24,$acc12 + ldwx,s $acc10($tbl),$acc10 + _srm $t0,16,$acc13 + ldwx,s $acc11($tbl),$acc11 + _srm $t1,8,$acc14 + ldwx,s $acc12($tbl),$acc12 + _srm $t2,0,$acc15 + ldwx,s $acc13($tbl),$acc13 + _ror $acc1,8,$acc1 + ldwx,s $acc14($tbl),$acc14 + + _ror $acc2,16,$acc2 + xor $acc0,$s0,$s0 + ldwx,s $acc15($tbl),$acc15 + _ror $acc3,24,$acc3 + xor $acc1,$s0,$s0 + ldw 16($key),$t0 + _ror $acc5,8,$acc5 + xor $acc2,$s0,$s0 + ldw 20($key),$t1 + _ror $acc6,16,$acc6 + xor $acc3,$s0,$s0 + ldw 24($key),$t2 + _ror $acc7,24,$acc7 + xor $acc4,$s1,$s1 + ldw 28($key),$t3 + _ror $acc9,8,$acc9 + xor $acc5,$s1,$s1 + ldw 1024+0($tbl),%r0 ; prefetch te4 + _ror $acc10,16,$acc10 + xor $acc6,$s1,$s1 + ldw 1024+32($tbl),%r0 ; prefetch te4 + _ror $acc11,24,$acc11 + xor $acc7,$s1,$s1 + ldw 1024+64($tbl),%r0 ; prefetch te4 + _ror $acc13,8,$acc13 + xor $acc8,$s2,$s2 + ldw 1024+96($tbl),%r0 ; prefetch te4 + _ror $acc14,16,$acc14 + xor $acc9,$s2,$s2 + ldw 1024+128($tbl),%r0 ; prefetch te4 + _ror $acc15,24,$acc15 + xor $acc10,$s2,$s2 + ldw 1024+160($tbl),%r0 ; prefetch te4 + _srm $s0,24,$acc0 + xor $acc11,$s2,$s2 + ldw 1024+192($tbl),%r0 ; prefetch te4 + xor $acc12,$acc14,$acc14 + xor $acc13,$s3,$s3 + ldw 1024+224($tbl),%r0 ; prefetch te4 + _srm $s1,16,$acc1 + xor $acc14,$s3,$s3 + b L\$enc_loop + xor $acc15,$s3,$s3 + + .ALIGN 16 +L\$enc_last + ldo 1024($tbl),$rounds + _ror $acc1,8,$acc1 + xor $acc0,$t0,$t0 + ldw 0($key),$s0 + _ror $acc2,16,$acc2 + xor $acc1,$t0,$t0 + ldw 4($key),$s1 + _ror $acc3,24,$acc3 + xor $acc2,$t0,$t0 + ldw 8($key),$s2 + _ror $acc5,8,$acc5 + xor $acc3,$t0,$t0 + ldw 12($key),$s3 + _ror $acc6,16,$acc6 + xor $acc4,$t1,$t1 + _ror $acc7,24,$acc7 + xor $acc5,$t1,$t1 + _ror $acc9,8,$acc9 + xor $acc6,$t1,$t1 + _ror $acc10,16,$acc10 + xor $acc7,$t1,$t1 + _ror $acc11,24,$acc11 + xor $acc8,$t2,$t2 + _ror $acc13,8,$acc13 + xor $acc9,$t2,$t2 + _ror $acc14,16,$acc14 + xor $acc10,$t2,$t2 + _ror $acc15,24,$acc15 + xor $acc11,$t2,$t2 + xor $acc12,$acc14,$acc14 + xor $acc13,$t3,$t3 + _srm $t0,24,$acc0 + xor $acc14,$t3,$t3 + _srm $t1,16,$acc1 + xor $acc15,$t3,$t3 + + _srm $t2,8,$acc2 + ldbx $acc0($rounds),$acc0 + _srm $t1,24,$acc4 + ldbx $acc1($rounds),$acc1 + _srm $t2,16,$acc5 + _srm $t3,0,$acc3 + ldbx $acc2($rounds),$acc2 + ldbx $acc3($rounds),$acc3 + _srm $t3,8,$acc6 + ldbx $acc4($rounds),$acc4 + _srm $t2,24,$acc8 + ldbx $acc5($rounds),$acc5 + _srm $t3,16,$acc9 + _srm $t0,0,$acc7 + ldbx $acc6($rounds),$acc6 + ldbx $acc7($rounds),$acc7 + _srm $t0,8,$acc10 + ldbx $acc8($rounds),$acc8 + _srm $t3,24,$acc12 + ldbx $acc9($rounds),$acc9 + _srm $t0,16,$acc13 + _srm $t1,0,$acc11 + ldbx $acc10($rounds),$acc10 + _srm $t1,8,$acc14 + ldbx $acc11($rounds),$acc11 + ldbx $acc12($rounds),$acc12 + ldbx $acc13($rounds),$acc13 + _srm $t2,0,$acc15 + ldbx $acc14($rounds),$acc14 + + dep $acc0,7,8,$acc3 + ldbx $acc15($rounds),$acc15 + dep $acc4,7,8,$acc7 + dep $acc1,15,8,$acc3 + dep $acc5,15,8,$acc7 + dep $acc2,23,8,$acc3 + dep $acc6,23,8,$acc7 + xor $acc3,$s0,$s0 + xor $acc7,$s1,$s1 + dep $acc8,7,8,$acc11 + dep $acc12,7,8,$acc15 + dep $acc9,15,8,$acc11 + dep $acc13,15,8,$acc15 + dep $acc10,23,8,$acc11 + dep $acc14,23,8,$acc15 + xor $acc11,$s2,$s2 + + bv (%r31) + .EXIT + xor $acc15,$s3,$s3 + .PROCEND + + .ALIGN 64 +L\$AES_Te + .WORD 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d + .WORD 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554 + .WORD 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d + .WORD 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a + .WORD 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87 + .WORD 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b + .WORD 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea + .WORD 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b + .WORD 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a + .WORD 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f + .WORD 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108 + .WORD 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f + .WORD 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e + .WORD 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5 + .WORD 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d + .WORD 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f + .WORD 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e + .WORD 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb + .WORD 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce + .WORD 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497 + .WORD 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c + .WORD 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed + .WORD 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b + .WORD 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a + .WORD 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16 + .WORD 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594 + .WORD 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81 + .WORD 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3 + .WORD 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a + .WORD 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504 + .WORD 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163 + .WORD 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d + .WORD 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f + .WORD 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739 + .WORD 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47 + .WORD 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395 + .WORD 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f + .WORD 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883 + .WORD 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c + .WORD 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76 + .WORD 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e + .WORD 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4 + .WORD 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6 + .WORD 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b + .WORD 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7 + .WORD 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0 + .WORD 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25 + .WORD 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818 + .WORD 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72 + .WORD 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651 + .WORD 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21 + .WORD 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85 + .WORD 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa + .WORD 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12 + .WORD 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0 + .WORD 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9 + .WORD 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133 + .WORD 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7 + .WORD 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920 + .WORD 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a + .WORD 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17 + .WORD 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8 + .WORD 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11 + .WORD 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a + .BYTE 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 + .BYTE 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 + .BYTE 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 + .BYTE 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 + .BYTE 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc + .BYTE 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 + .BYTE 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a + .BYTE 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 + .BYTE 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 + .BYTE 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 + .BYTE 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b + .BYTE 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf + .BYTE 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 + .BYTE 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 + .BYTE 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 + .BYTE 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 + .BYTE 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 + .BYTE 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 + .BYTE 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 + .BYTE 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb + .BYTE 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c + .BYTE 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 + .BYTE 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 + .BYTE 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 + .BYTE 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 + .BYTE 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a + .BYTE 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e + .BYTE 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e + .BYTE 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 + .BYTE 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf + .BYTE 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 + .BYTE 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 +___ + +$code.=<<___; + .EXPORT AES_decrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR + .ALIGN 16 +AES_decrypt + .PROC + .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) + $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) + $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) + $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) + $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) + $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) + $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp) + $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp) + + blr %r0,$tbl + ldi 3,$t0 +L\$dec_pic + andcm $tbl,$t0,$tbl + ldo L\$AES_Td-L\$dec_pic($tbl),$tbl + + and $inp,$t0,$t0 + sub $inp,$t0,$inp + ldw 0($inp),$s0 + ldw 4($inp),$s1 + ldw 8($inp),$s2 + comib,= 0,$t0,L\$dec_inp_aligned + ldw 12($inp),$s3 + + sh3addl $t0,%r0,$t0 + subi 32,$t0,$t0 + mtctl $t0,%cr11 + ldw 16($inp),$t1 + vshd $s0,$s1,$s0 + vshd $s1,$s2,$s1 + vshd $s2,$s3,$s2 + vshd $s3,$t1,$s3 + +L\$dec_inp_aligned + bl _parisc_AES_decrypt,%r31 + nop + + extru,<> $out,31,2,%r0 + b L\$dec_out_aligned + nop + + _srm $s0,24,$acc0 + _srm $s0,16,$acc1 + stb $acc0,0($out) + _srm $s0,8,$acc2 + stb $acc1,1($out) + _srm $s1,24,$acc4 + stb $acc2,2($out) + _srm $s1,16,$acc5 + stb $s0,3($out) + _srm $s1,8,$acc6 + stb $acc4,4($out) + _srm $s2,24,$acc0 + stb $acc5,5($out) + _srm $s2,16,$acc1 + stb $acc6,6($out) + _srm $s2,8,$acc2 + stb $s1,7($out) + _srm $s3,24,$acc4 + stb $acc0,8($out) + _srm $s3,16,$acc5 + stb $acc1,9($out) + _srm $s3,8,$acc6 + stb $acc2,10($out) + stb $s2,11($out) + stb $acc4,12($out) + stb $acc5,13($out) + stb $acc6,14($out) + b L\$dec_done + stb $s3,15($out) + +L\$dec_out_aligned + stw $s0,0($out) + stw $s1,4($out) + stw $s2,8($out) + stw $s3,12($out) + +L\$dec_done + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 + $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 + $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 + $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 + $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 + $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 + $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 + $POP `-$FRAME+14*$SIZE_T`(%sp),%r17 + $POP `-$FRAME+15*$SIZE_T`(%sp),%r18 + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + + .ALIGN 16 +_parisc_AES_decrypt + .PROC + .CALLINFO MILLICODE + .ENTRY + ldw 240($key),$rounds + ldw 0($key),$t0 + ldw 4($key),$t1 + ldw 8($key),$t2 + ldw 12($key),$t3 + _srm $rounds,1,$rounds + xor $t0,$s0,$s0 + ldw 16($key),$t0 + xor $t1,$s1,$s1 + ldw 20($key),$t1 + _srm $s0,24,$acc0 + xor $t2,$s2,$s2 + ldw 24($key),$t2 + xor $t3,$s3,$s3 + ldw 28($key),$t3 + _srm $s3,16,$acc1 +L\$dec_loop + _srm $s2,8,$acc2 + ldwx,s $acc0($tbl),$acc0 + _srm $s1,0,$acc3 + ldwx,s $acc1($tbl),$acc1 + _srm $s1,24,$acc4 + ldwx,s $acc2($tbl),$acc2 + _srm $s0,16,$acc5 + ldwx,s $acc3($tbl),$acc3 + _srm $s3,8,$acc6 + ldwx,s $acc4($tbl),$acc4 + _srm $s2,0,$acc7 + ldwx,s $acc5($tbl),$acc5 + _srm $s2,24,$acc8 + ldwx,s $acc6($tbl),$acc6 + _srm $s1,16,$acc9 + ldwx,s $acc7($tbl),$acc7 + _srm $s0,8,$acc10 + ldwx,s $acc8($tbl),$acc8 + _srm $s3,0,$acc11 + ldwx,s $acc9($tbl),$acc9 + _srm $s3,24,$acc12 + ldwx,s $acc10($tbl),$acc10 + _srm $s2,16,$acc13 + ldwx,s $acc11($tbl),$acc11 + _srm $s1,8,$acc14 + ldwx,s $acc12($tbl),$acc12 + _srm $s0,0,$acc15 + ldwx,s $acc13($tbl),$acc13 + ldwx,s $acc14($tbl),$acc14 + ldwx,s $acc15($tbl),$acc15 + addib,= -1,$rounds,L\$dec_last + ldo 32($key),$key + + _ror $acc1,8,$acc1 + xor $acc0,$t0,$t0 + ldw 0($key),$s0 + _ror $acc2,16,$acc2 + xor $acc1,$t0,$t0 + ldw 4($key),$s1 + _ror $acc3,24,$acc3 + xor $acc2,$t0,$t0 + ldw 8($key),$s2 + _ror $acc5,8,$acc5 + xor $acc3,$t0,$t0 + ldw 12($key),$s3 + _ror $acc6,16,$acc6 + xor $acc4,$t1,$t1 + _ror $acc7,24,$acc7 + xor $acc5,$t1,$t1 + _ror $acc9,8,$acc9 + xor $acc6,$t1,$t1 + _ror $acc10,16,$acc10 + xor $acc7,$t1,$t1 + _ror $acc11,24,$acc11 + xor $acc8,$t2,$t2 + _ror $acc13,8,$acc13 + xor $acc9,$t2,$t2 + _ror $acc14,16,$acc14 + xor $acc10,$t2,$t2 + _ror $acc15,24,$acc15 + xor $acc11,$t2,$t2 + xor $acc12,$acc14,$acc14 + xor $acc13,$t3,$t3 + _srm $t0,24,$acc0 + xor $acc14,$t3,$t3 + xor $acc15,$t3,$t3 + _srm $t3,16,$acc1 + + _srm $t2,8,$acc2 + ldwx,s $acc0($tbl),$acc0 + _srm $t1,0,$acc3 + ldwx,s $acc1($tbl),$acc1 + _srm $t1,24,$acc4 + ldwx,s $acc2($tbl),$acc2 + _srm $t0,16,$acc5 + ldwx,s $acc3($tbl),$acc3 + _srm $t3,8,$acc6 + ldwx,s $acc4($tbl),$acc4 + _srm $t2,0,$acc7 + ldwx,s $acc5($tbl),$acc5 + _srm $t2,24,$acc8 + ldwx,s $acc6($tbl),$acc6 + _srm $t1,16,$acc9 + ldwx,s $acc7($tbl),$acc7 + _srm $t0,8,$acc10 + ldwx,s $acc8($tbl),$acc8 + _srm $t3,0,$acc11 + ldwx,s $acc9($tbl),$acc9 + _srm $t3,24,$acc12 + ldwx,s $acc10($tbl),$acc10 + _srm $t2,16,$acc13 + ldwx,s $acc11($tbl),$acc11 + _srm $t1,8,$acc14 + ldwx,s $acc12($tbl),$acc12 + _srm $t0,0,$acc15 + ldwx,s $acc13($tbl),$acc13 + _ror $acc1,8,$acc1 + ldwx,s $acc14($tbl),$acc14 + + _ror $acc2,16,$acc2 + xor $acc0,$s0,$s0 + ldwx,s $acc15($tbl),$acc15 + _ror $acc3,24,$acc3 + xor $acc1,$s0,$s0 + ldw 16($key),$t0 + _ror $acc5,8,$acc5 + xor $acc2,$s0,$s0 + ldw 20($key),$t1 + _ror $acc6,16,$acc6 + xor $acc3,$s0,$s0 + ldw 24($key),$t2 + _ror $acc7,24,$acc7 + xor $acc4,$s1,$s1 + ldw 28($key),$t3 + _ror $acc9,8,$acc9 + xor $acc5,$s1,$s1 + ldw 1024+0($tbl),%r0 ; prefetch td4 + _ror $acc10,16,$acc10 + xor $acc6,$s1,$s1 + ldw 1024+32($tbl),%r0 ; prefetch td4 + _ror $acc11,24,$acc11 + xor $acc7,$s1,$s1 + ldw 1024+64($tbl),%r0 ; prefetch td4 + _ror $acc13,8,$acc13 + xor $acc8,$s2,$s2 + ldw 1024+96($tbl),%r0 ; prefetch td4 + _ror $acc14,16,$acc14 + xor $acc9,$s2,$s2 + ldw 1024+128($tbl),%r0 ; prefetch td4 + _ror $acc15,24,$acc15 + xor $acc10,$s2,$s2 + ldw 1024+160($tbl),%r0 ; prefetch td4 + _srm $s0,24,$acc0 + xor $acc11,$s2,$s2 + ldw 1024+192($tbl),%r0 ; prefetch td4 + xor $acc12,$acc14,$acc14 + xor $acc13,$s3,$s3 + ldw 1024+224($tbl),%r0 ; prefetch td4 + xor $acc14,$s3,$s3 + xor $acc15,$s3,$s3 + b L\$dec_loop + _srm $s3,16,$acc1 + + .ALIGN 16 +L\$dec_last + ldo 1024($tbl),$rounds + _ror $acc1,8,$acc1 + xor $acc0,$t0,$t0 + ldw 0($key),$s0 + _ror $acc2,16,$acc2 + xor $acc1,$t0,$t0 + ldw 4($key),$s1 + _ror $acc3,24,$acc3 + xor $acc2,$t0,$t0 + ldw 8($key),$s2 + _ror $acc5,8,$acc5 + xor $acc3,$t0,$t0 + ldw 12($key),$s3 + _ror $acc6,16,$acc6 + xor $acc4,$t1,$t1 + _ror $acc7,24,$acc7 + xor $acc5,$t1,$t1 + _ror $acc9,8,$acc9 + xor $acc6,$t1,$t1 + _ror $acc10,16,$acc10 + xor $acc7,$t1,$t1 + _ror $acc11,24,$acc11 + xor $acc8,$t2,$t2 + _ror $acc13,8,$acc13 + xor $acc9,$t2,$t2 + _ror $acc14,16,$acc14 + xor $acc10,$t2,$t2 + _ror $acc15,24,$acc15 + xor $acc11,$t2,$t2 + xor $acc12,$acc14,$acc14 + xor $acc13,$t3,$t3 + _srm $t0,24,$acc0 + xor $acc14,$t3,$t3 + xor $acc15,$t3,$t3 + _srm $t3,16,$acc1 + + _srm $t2,8,$acc2 + ldbx $acc0($rounds),$acc0 + _srm $t1,24,$acc4 + ldbx $acc1($rounds),$acc1 + _srm $t0,16,$acc5 + _srm $t1,0,$acc3 + ldbx $acc2($rounds),$acc2 + ldbx $acc3($rounds),$acc3 + _srm $t3,8,$acc6 + ldbx $acc4($rounds),$acc4 + _srm $t2,24,$acc8 + ldbx $acc5($rounds),$acc5 + _srm $t1,16,$acc9 + _srm $t2,0,$acc7 + ldbx $acc6($rounds),$acc6 + ldbx $acc7($rounds),$acc7 + _srm $t0,8,$acc10 + ldbx $acc8($rounds),$acc8 + _srm $t3,24,$acc12 + ldbx $acc9($rounds),$acc9 + _srm $t2,16,$acc13 + _srm $t3,0,$acc11 + ldbx $acc10($rounds),$acc10 + _srm $t1,8,$acc14 + ldbx $acc11($rounds),$acc11 + ldbx $acc12($rounds),$acc12 + ldbx $acc13($rounds),$acc13 + _srm $t0,0,$acc15 + ldbx $acc14($rounds),$acc14 + + dep $acc0,7,8,$acc3 + ldbx $acc15($rounds),$acc15 + dep $acc4,7,8,$acc7 + dep $acc1,15,8,$acc3 + dep $acc5,15,8,$acc7 + dep $acc2,23,8,$acc3 + dep $acc6,23,8,$acc7 + xor $acc3,$s0,$s0 + xor $acc7,$s1,$s1 + dep $acc8,7,8,$acc11 + dep $acc12,7,8,$acc15 + dep $acc9,15,8,$acc11 + dep $acc13,15,8,$acc15 + dep $acc10,23,8,$acc11 + dep $acc14,23,8,$acc15 + xor $acc11,$s2,$s2 + + bv (%r31) + .EXIT + xor $acc15,$s3,$s3 + .PROCEND + + .ALIGN 64 +L\$AES_Td + .WORD 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96 + .WORD 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393 + .WORD 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25 + .WORD 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f + .WORD 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1 + .WORD 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6 + .WORD 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da + .WORD 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844 + .WORD 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd + .WORD 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4 + .WORD 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45 + .WORD 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94 + .WORD 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7 + .WORD 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a + .WORD 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5 + .WORD 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c + .WORD 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1 + .WORD 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a + .WORD 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75 + .WORD 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051 + .WORD 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46 + .WORD 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff + .WORD 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77 + .WORD 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb + .WORD 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000 + .WORD 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e + .WORD 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927 + .WORD 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a + .WORD 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e + .WORD 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16 + .WORD 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d + .WORD 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8 + .WORD 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd + .WORD 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34 + .WORD 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163 + .WORD 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120 + .WORD 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d + .WORD 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0 + .WORD 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422 + .WORD 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef + .WORD 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36 + .WORD 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4 + .WORD 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662 + .WORD 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5 + .WORD 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3 + .WORD 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b + .WORD 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8 + .WORD 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6 + .WORD 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6 + .WORD 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0 + .WORD 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815 + .WORD 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f + .WORD 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df + .WORD 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f + .WORD 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e + .WORD 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713 + .WORD 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89 + .WORD 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c + .WORD 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf + .WORD 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86 + .WORD 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f + .WORD 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541 + .WORD 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190 + .WORD 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742 + .BYTE 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 + .BYTE 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb + .BYTE 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 + .BYTE 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb + .BYTE 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d + .BYTE 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e + .BYTE 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 + .BYTE 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 + .BYTE 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 + .BYTE 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 + .BYTE 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda + .BYTE 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 + .BYTE 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a + .BYTE 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 + .BYTE 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 + .BYTE 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b + .BYTE 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea + .BYTE 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 + .BYTE 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 + .BYTE 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e + .BYTE 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 + .BYTE 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b + .BYTE 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 + .BYTE 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 + .BYTE 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 + .BYTE 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f + .BYTE 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d + .BYTE 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef + .BYTE 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 + .BYTE 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 + .BYTE 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 + .BYTE 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + .STRINGZ "AES for PA-RISC, CRYPTOGAMS by " +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + # translate made up instructons: _ror, _srm + s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/ or + + s/_srm(\s+%r[0-9]+),([0-9]+),/ + $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2) + : sprintf("extrd,u%s,%d,8,",$1,63-$2)/e; + + s/,\*/,/ if ($SIZE_T==4); + print $_,"\n"; +} +close STDOUT; diff --git a/crypto/bn/asm/parisc-mont.pl b/crypto/bn/asm/parisc-mont.pl new file mode 100644 index 0000000000..4a766a87fb --- /dev/null +++ b/crypto/bn/asm/parisc-mont.pl @@ -0,0 +1,993 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# On PA-7100LC this module performs ~90-50% better, less for longer +# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means +# that compiler utilized xmpyu instruction to perform 32x32=64-bit +# multiplication, which in turn means that "baseline" performance was +# optimal in respect to instruction set capabilities. Fair comparison +# with vendor compiler is problematic, because OpenSSL doesn't define +# BN_LLONG [presumably] for historical reasons, which drives compiler +# toward 4 times 16x16=32-bit multiplicatons [plus complementary +# shifts and additions] instead. This means that you should observe +# several times improvement over code generated by vendor compiler +# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual +# improvement coefficient was never collected on PA-7100LC, or any +# other 1.1 CPU, because I don't have access to such machine with +# vendor compiler. But to give you a taste, PA-RISC 1.1 code path +# reportedly outperformed code generated by cc +DA1.1 +O3 by factor +# of ~5x on PA-8600. +# +# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is +# reportedly ~2x faster than vendor compiler generated code [according +# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of +# this implementation is actually 32-bit one, in the sense that it +# operates on 32-bit values. But pa-risc2[W].s operates on arrays of +# 64-bit BN_LONGs... How do they interoperate then? No problem. This +# module picks halves of 64-bit values in reverse order and pretends +# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure" +# 64-bit code such as pa-risc2[W].s then? Well, the thing is that +# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do, +# i.e. there is no "wider" multiplication like on most other 64-bit +# platforms. This means that even being effectively 32-bit, this +# implementation performs "64-bit" computational task in same amount +# of arithmetic operations, most notably multiplications. It requires +# more memory references, most notably to tp[num], but this doesn't +# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC +# 2.0 code path, provides virtually same performance as pa-risc2[W].s: +# it's ~10% better for shortest key length and ~10% worse for longest +# one. +# +# In case it wasn't clear. The module has two distinct code paths: +# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit +# additions and 64-bit integer loads, not to mention specific +# instruction scheduling. In 64-bit build naturally only 2.0 code path +# is assembled. In 32-bit application context both code paths are +# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path +# is taken automatically. Also, in 32-bit build the module imposes +# couple of limitations: vector lengths has to be even and vector +# addresses has to be 64-bit aligned. Normally neither is a problem: +# most common key lengths are even and vectors are commonly malloc-ed, +# which ensures alignment. +# +# Special thanks to polarhome.com for providing HP-UX account on +# PA-RISC 1.1 machine, and to correspondent who chose to remain +# anonymous for testing the code on PA-RISC 2.0 machine. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + +$flavour = shift; +$output = shift; + +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $FRAME_MARKER =80; + $SAVED_RP =16; + $PUSH ="std"; + $PUSHMA ="std,ma"; + $POP ="ldd"; + $POPMB ="ldd,mb"; + $BN_SZ =$SIZE_T; +} else { + $LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0"; + $SIZE_T =4; + $FRAME_MARKER =48; + $SAVED_RP =20; + $PUSH ="stw"; + $PUSHMA ="stwm"; + $POP ="ldw"; + $POPMB ="ldwm"; + $BN_SZ =$SIZE_T; + if (open CONF,"<${dir}../../opensslconf.h") { + while() { + if (m/#\s*define\s+SIXTY_FOUR_BIT/) { + $BN_SZ=8; + $LEVEL="2.0"; + last; + } + } + close CONF; + } +} + +$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker + # [+ argument transfer] +$LOCALS=$FRAME-$FRAME_MARKER; +$FRAME+=32; # local variables + +$tp="%r31"; +$ti1="%r29"; +$ti0="%r28"; + +$rp="%r26"; +$ap="%r25"; +$bp="%r24"; +$np="%r23"; +$n0="%r22"; # passed through stack in 32-bit +$num="%r21"; # passed through stack in 32-bit +$idx="%r20"; +$arrsz="%r19"; + +$nm1="%r7"; +$nm0="%r6"; +$ab1="%r5"; +$ab0="%r4"; + +$fp="%r3"; +$hi1="%r2"; +$hi0="%r1"; + +$xfer=$n0; # accomodates [-16..15] offset in fld[dw]s + +$fm0="%fr4"; $fti=$fm0; +$fbi="%fr5L"; +$fn0="%fr5R"; +$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8"; +$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11"; + +$code=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR + .ALIGN 64 +bn_mul_mont + .PROC + .CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + ldo -$FRAME(%sp),$fp +___ +$code.=<<___ if ($SIZE_T==4); + ldw `-$FRAME_MARKER-4`($fp),$n0 + ldw `-$FRAME_MARKER-8`($fp),$num + nop + nop ; alignment +___ +$code.=<<___ if ($BN_SZ==4); + comiclr,<= 6,$num,%r0 ; are vectors long enough? + b L\$abort + ldi 0,%r28 ; signal "unhandled" + add,ev %r0,$num,$num ; is $num even? + b L\$abort + nop + or $ap,$np,$ti1 + extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned? + b L\$abort + nop + nop ; alignment + nop + + fldws 0($n0),${fn0} + fldws,ma 4($bp),${fbi} ; bp[0] +___ +$code.=<<___ if ($BN_SZ==8); + comib,> 3,$num,L\$abort ; are vectors long enough? + ldi 0,%r28 ; signal "unhandled" + addl $num,$num,$num ; I operate on 32-bit values + + fldws 4($n0),${fn0} ; only low part of n0 + fldws 4($bp),${fbi} ; bp[0] in flipped word order +___ +$code.=<<___; + fldds 0($ap),${fai} ; ap[0,1] + fldds 0($np),${fni} ; np[0,1] + + sh2addl $num,%r0,$arrsz + ldi 31,$hi0 + ldo 36($arrsz),$hi1 ; space for tp[num+1] + andcm $hi1,$hi0,$hi1 ; align + addl $hi1,%sp,%sp + $PUSH $fp,-$SIZE_T(%sp) + + ldo `$LOCALS+16`($fp),$xfer + ldo `$LOCALS+32+4`($fp),$tp + + xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0] + xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0] + xmpyu ${fn0},${fab0}R,${fm0} + + addl $arrsz,$ap,$ap ; point at the end + addl $arrsz,$np,$np + subi 0,$arrsz,$idx ; j=0 + ldo 8($idx),$idx ; j++++ + + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m + fstds ${fab0},-16($xfer) + fstds ${fnm0},-8($xfer) + fstds ${fab1},0($xfer) + fstds ${fnm1},8($xfer) + flddx $idx($ap),${fai} ; ap[2,3] + flddx $idx($np),${fni} ; np[2,3] +___ +$code.=<<___ if ($BN_SZ==4); + mtctl $hi0,%cr11 ; $hi0 still holds 31 + extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0 + b L\$parisc11 + nop +___ +$code.=<<___; # PA-RISC 2.0 code-path + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + ldd -16($xfer),$ab0 + fstds ${fab0},-16($xfer) + + extrd,u $ab0,31,32,$hi0 + extrd,u $ab0,63,32,$ab0 + ldd -8($xfer),$nm0 + fstds ${fnm0},-8($xfer) + ldo 8($idx),$idx ; j++++ + addl $ab0,$nm0,$nm0 ; low part is discarded + extrd,u $nm0,31,32,$hi1 + +L\$1st + xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m + ldd 0($xfer),$ab1 + fstds ${fab1},0($xfer) + addl $hi0,$ab1,$ab1 + extrd,u $ab1,31,32,$hi0 + ldd 8($xfer),$nm1 + fstds ${fnm1},8($xfer) + extrd,u $ab1,63,32,$ab1 + addl $hi1,$nm1,$nm1 + flddx $idx($ap),${fai} ; ap[j,j+1] + flddx $idx($np),${fni} ; np[j,j+1] + addl $ab1,$nm1,$nm1 + extrd,u $nm1,31,32,$hi1 + + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + ldd -16($xfer),$ab0 + fstds ${fab0},-16($xfer) + addl $hi0,$ab0,$ab0 + extrd,u $ab0,31,32,$hi0 + ldd -8($xfer),$nm0 + fstds ${fnm0},-8($xfer) + extrd,u $ab0,63,32,$ab0 + addl $hi1,$nm0,$nm0 + stw $nm1,-4($tp) ; tp[j-1] + addl $ab0,$nm0,$nm0 + stw,ma $nm0,8($tp) ; tp[j-1] + addib,<> 8,$idx,L\$1st ; j++++ + extrd,u $nm0,31,32,$hi1 + + xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m + ldd 0($xfer),$ab1 + fstds ${fab1},0($xfer) + addl $hi0,$ab1,$ab1 + extrd,u $ab1,31,32,$hi0 + ldd 8($xfer),$nm1 + fstds ${fnm1},8($xfer) + extrd,u $ab1,63,32,$ab1 + addl $hi1,$nm1,$nm1 + ldd -16($xfer),$ab0 + addl $ab1,$nm1,$nm1 + ldd -8($xfer),$nm0 + extrd,u $nm1,31,32,$hi1 + + addl $hi0,$ab0,$ab0 + extrd,u $ab0,31,32,$hi0 + stw $nm1,-4($tp) ; tp[j-1] + extrd,u $ab0,63,32,$ab0 + addl $hi1,$nm0,$nm0 + ldd 0($xfer),$ab1 + addl $ab0,$nm0,$nm0 + ldd,mb 8($xfer),$nm1 + extrd,u $nm0,31,32,$hi1 + stw,ma $nm0,8($tp) ; tp[j-1] + + ldo -1($num),$num ; i-- + subi 0,$arrsz,$idx ; j=0 +___ +$code.=<<___ if ($BN_SZ==4); + fldws,ma 4($bp),${fbi} ; bp[1] +___ +$code.=<<___ if ($BN_SZ==8); + fldws 0($bp),${fbi} ; bp[1] in flipped word order +___ +$code.=<<___; + flddx $idx($ap),${fai} ; ap[0,1] + flddx $idx($np),${fni} ; np[0,1] + fldws 8($xfer),${fti}R ; tp[0] + addl $hi0,$ab1,$ab1 + extrd,u $ab1,31,32,$hi0 + extrd,u $ab1,63,32,$ab1 + ldo 8($idx),$idx ; j++++ + xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] + xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] + addl $hi1,$nm1,$nm1 + addl $ab1,$nm1,$nm1 + extrd,u $nm1,31,32,$hi1 + fstws,mb ${fab0}L,-8($xfer) ; save high part + stw $nm1,-4($tp) ; tp[j-1] + + fcpy,sgl %fr0,${fti}L ; zero high part + fcpy,sgl %fr0,${fab0}L + addl $hi1,$hi0,$hi0 + extrd,u $hi0,31,32,$hi1 + fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double + fcnvxf,dbl,dbl ${fab0},${fab0} + stw $hi0,0($tp) + stw $hi1,4($tp) + + fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] + fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int + xmpyu ${fn0},${fab0}R,${fm0} + ldo `$LOCALS+32+4`($fp),$tp +L\$outer + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m + fstds ${fab0},-16($xfer) ; 33-bit value + fstds ${fnm0},-8($xfer) + flddx $idx($ap),${fai} ; ap[2] + flddx $idx($np),${fni} ; np[2] + ldo 8($idx),$idx ; j++++ + ldd -16($xfer),$ab0 ; 33-bit value + ldd -8($xfer),$nm0 + ldw 0($xfer),$hi0 ; high part + + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + extrd,u $ab0,31,32,$ti0 ; carry bit + extrd,u $ab0,63,32,$ab0 + fstds ${fab1},0($xfer) + addl $ti0,$hi0,$hi0 ; account carry bit + fstds ${fnm1},8($xfer) + addl $ab0,$nm0,$nm0 ; low part is discarded + ldw 0($tp),$ti1 ; tp[1] + extrd,u $nm0,31,32,$hi1 + fstds ${fab0},-16($xfer) + fstds ${fnm0},-8($xfer) + +L\$inner + xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m + ldd 0($xfer),$ab1 + fstds ${fab1},0($xfer) + addl $hi0,$ti1,$ti1 + addl $ti1,$ab1,$ab1 + ldd 8($xfer),$nm1 + fstds ${fnm1},8($xfer) + extrd,u $ab1,31,32,$hi0 + extrd,u $ab1,63,32,$ab1 + flddx $idx($ap),${fai} ; ap[j,j+1] + flddx $idx($np),${fni} ; np[j,j+1] + addl $hi1,$nm1,$nm1 + addl $ab1,$nm1,$nm1 + ldw 4($tp),$ti0 ; tp[j] + stw $nm1,-4($tp) ; tp[j-1] + + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + ldd -16($xfer),$ab0 + fstds ${fab0},-16($xfer) + addl $hi0,$ti0,$ti0 + addl $ti0,$ab0,$ab0 + ldd -8($xfer),$nm0 + fstds ${fnm0},-8($xfer) + extrd,u $ab0,31,32,$hi0 + extrd,u $nm1,31,32,$hi1 + ldw 8($tp),$ti1 ; tp[j] + extrd,u $ab0,63,32,$ab0 + addl $hi1,$nm0,$nm0 + addl $ab0,$nm0,$nm0 + stw,ma $nm0,8($tp) ; tp[j-1] + addib,<> 8,$idx,L\$inner ; j++++ + extrd,u $nm0,31,32,$hi1 + + xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m + ldd 0($xfer),$ab1 + fstds ${fab1},0($xfer) + addl $hi0,$ti1,$ti1 + addl $ti1,$ab1,$ab1 + ldd 8($xfer),$nm1 + fstds ${fnm1},8($xfer) + extrd,u $ab1,31,32,$hi0 + extrd,u $ab1,63,32,$ab1 + ldw 4($tp),$ti0 ; tp[j] + addl $hi1,$nm1,$nm1 + addl $ab1,$nm1,$nm1 + ldd -16($xfer),$ab0 + ldd -8($xfer),$nm0 + extrd,u $nm1,31,32,$hi1 + + addl $hi0,$ab0,$ab0 + addl $ti0,$ab0,$ab0 + stw $nm1,-4($tp) ; tp[j-1] + extrd,u $ab0,31,32,$hi0 + ldw 8($tp),$ti1 ; tp[j] + extrd,u $ab0,63,32,$ab0 + addl $hi1,$nm0,$nm0 + ldd 0($xfer),$ab1 + addl $ab0,$nm0,$nm0 + ldd,mb 8($xfer),$nm1 + extrd,u $nm0,31,32,$hi1 + stw,ma $nm0,8($tp) ; tp[j-1] + + addib,= -1,$num,L\$outerdone ; i-- + subi 0,$arrsz,$idx ; j=0 +___ +$code.=<<___ if ($BN_SZ==4); + fldws,ma 4($bp),${fbi} ; bp[i] +___ +$code.=<<___ if ($BN_SZ==8); + ldi 12,$ti0 ; bp[i] in flipped word order + addl,ev %r0,$num,$num + ldi -4,$ti0 + addl $ti0,$bp,$bp + fldws 0($bp),${fbi} +___ +$code.=<<___; + flddx $idx($ap),${fai} ; ap[0] + addl $hi0,$ab1,$ab1 + flddx $idx($np),${fni} ; np[0] + fldws 8($xfer),${fti}R ; tp[0] + addl $ti1,$ab1,$ab1 + extrd,u $ab1,31,32,$hi0 + extrd,u $ab1,63,32,$ab1 + + ldo 8($idx),$idx ; j++++ + xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] + xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] + ldw 4($tp),$ti0 ; tp[j] + + addl $hi1,$nm1,$nm1 + fstws,mb ${fab0}L,-8($xfer) ; save high part + addl $ab1,$nm1,$nm1 + extrd,u $nm1,31,32,$hi1 + fcpy,sgl %fr0,${fti}L ; zero high part + fcpy,sgl %fr0,${fab0}L + stw $nm1,-4($tp) ; tp[j-1] + + fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double + fcnvxf,dbl,dbl ${fab0},${fab0} + addl $hi1,$hi0,$hi0 + fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] + addl $ti0,$hi0,$hi0 + extrd,u $hi0,31,32,$hi1 + fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int + stw $hi0,0($tp) + stw $hi1,4($tp) + xmpyu ${fn0},${fab0}R,${fm0} + + b L\$outer + ldo `$LOCALS+32+4`($fp),$tp + +L\$outerdone + addl $hi0,$ab1,$ab1 + addl $ti1,$ab1,$ab1 + extrd,u $ab1,31,32,$hi0 + extrd,u $ab1,63,32,$ab1 + + ldw 4($tp),$ti0 ; tp[j] + + addl $hi1,$nm1,$nm1 + addl $ab1,$nm1,$nm1 + extrd,u $nm1,31,32,$hi1 + stw $nm1,-4($tp) ; tp[j-1] + + addl $hi1,$hi0,$hi0 + addl $ti0,$hi0,$hi0 + extrd,u $hi0,31,32,$hi1 + stw $hi0,0($tp) + stw $hi1,4($tp) + + ldo `$LOCALS+32`($fp),$tp + sub %r0,%r0,%r0 ; clear borrow +___ +$code.=<<___ if ($BN_SZ==4); + ldws,ma 4($tp),$ti0 + extru,= $rp,31,3,%r0 ; is rp 64-bit aligned? + b L\$sub_pa11 + addl $tp,$arrsz,$tp +L\$sub + ldwx $idx($np),$hi0 + subb $ti0,$hi0,$hi1 + ldwx $idx($tp),$ti0 + addib,<> 4,$idx,L\$sub + stws,ma $hi1,4($rp) + + subb $ti0,%r0,$hi1 + ldo -4($tp),$tp +___ +$code.=<<___ if ($BN_SZ==8); + ldd,ma 8($tp),$ti0 +L\$sub + ldd $idx($np),$hi0 + shrpd $ti0,$ti0,32,$ti0 ; flip word order + std $ti0,-8($tp) ; save flipped value + sub,db $ti0,$hi0,$hi1 + ldd,ma 8($tp),$ti0 + addib,<> 8,$idx,L\$sub + std,ma $hi1,8($rp) + + extrd,u $ti0,31,32,$ti0 ; carry in flipped word order + sub,db $ti0,%r0,$hi1 + ldo -8($tp),$tp +___ +$code.=<<___; + and $tp,$hi1,$ap + andcm $rp,$hi1,$bp + or $ap,$bp,$np + + sub $rp,$arrsz,$rp ; rewind rp + subi 0,$arrsz,$idx + ldo `$LOCALS+32`($fp),$tp +L\$copy + ldd $idx($np),$hi0 + std,ma %r0,8($tp) + addib,<> 8,$idx,.-8 ; L\$copy + std,ma $hi0,8($rp) +___ + +if ($BN_SZ==4) { # PA-RISC 1.1 code-path +$ablo=$ab0; +$abhi=$ab1; +$nmlo0=$nm0; +$nmhi0=$nm1; +$nmlo1="%r9"; +$nmhi1="%r8"; + +$code.=<<___; + b L\$done + nop + + .ALIGN 8 +L\$parisc11 + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + ldw -12($xfer),$ablo + ldw -16($xfer),$hi0 + ldw -4($xfer),$nmlo0 + ldw -8($xfer),$nmhi0 + fstds ${fab0},-16($xfer) + fstds ${fnm0},-8($xfer) + + ldo 8($idx),$idx ; j++++ + add $ablo,$nmlo0,$nmlo0 ; discarded + addc %r0,$nmhi0,$hi1 + ldw 4($xfer),$ablo + ldw 0($xfer),$abhi + nop + +L\$1st_pa11 + xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] + flddx $idx($ap),${fai} ; ap[j,j+1] + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m + flddx $idx($np),${fni} ; np[j,j+1] + add $hi0,$ablo,$ablo + ldw 12($xfer),$nmlo1 + addc %r0,$abhi,$hi0 + ldw 8($xfer),$nmhi1 + add $ablo,$nmlo1,$nmlo1 + fstds ${fab1},0($xfer) + addc %r0,$nmhi1,$nmhi1 + fstds ${fnm1},8($xfer) + add $hi1,$nmlo1,$nmlo1 + ldw -12($xfer),$ablo + addc %r0,$nmhi1,$hi1 + ldw -16($xfer),$abhi + + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] + ldw -4($xfer),$nmlo0 + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + ldw -8($xfer),$nmhi0 + add $hi0,$ablo,$ablo + stw $nmlo1,-4($tp) ; tp[j-1] + addc %r0,$abhi,$hi0 + fstds ${fab0},-16($xfer) + add $ablo,$nmlo0,$nmlo0 + fstds ${fnm0},-8($xfer) + addc %r0,$nmhi0,$nmhi0 + ldw 0($xfer),$abhi + add $hi1,$nmlo0,$nmlo0 + ldw 4($xfer),$ablo + stws,ma $nmlo0,8($tp) ; tp[j-1] + addib,<> 8,$idx,L\$1st_pa11 ; j++++ + addc %r0,$nmhi0,$hi1 + + ldw 8($xfer),$nmhi1 + ldw 12($xfer),$nmlo1 + xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m + add $hi0,$ablo,$ablo + fstds ${fab1},0($xfer) + addc %r0,$abhi,$hi0 + fstds ${fnm1},8($xfer) + add $ablo,$nmlo1,$nmlo1 + ldw -16($xfer),$abhi + addc %r0,$nmhi1,$nmhi1 + ldw -12($xfer),$ablo + add $hi1,$nmlo1,$nmlo1 + ldw -8($xfer),$nmhi0 + addc %r0,$nmhi1,$hi1 + ldw -4($xfer),$nmlo0 + + add $hi0,$ablo,$ablo + stw $nmlo1,-4($tp) ; tp[j-1] + addc %r0,$abhi,$hi0 + ldw 0($xfer),$abhi + add $ablo,$nmlo0,$nmlo0 + ldw 4($xfer),$ablo + addc %r0,$nmhi0,$nmhi0 + ldws,mb 8($xfer),$nmhi1 + add $hi1,$nmlo0,$nmlo0 + ldw 4($xfer),$nmlo1 + addc %r0,$nmhi0,$hi1 + stws,ma $nmlo0,8($tp) ; tp[j-1] + + ldo -1($num),$num ; i-- + subi 0,$arrsz,$idx ; j=0 + + fldws,ma 4($bp),${fbi} ; bp[1] + flddx $idx($ap),${fai} ; ap[0,1] + flddx $idx($np),${fni} ; np[0,1] + fldws 8($xfer),${fti}R ; tp[0] + add $hi0,$ablo,$ablo + addc %r0,$abhi,$hi0 + ldo 8($idx),$idx ; j++++ + xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] + xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] + add $hi1,$nmlo1,$nmlo1 + addc %r0,$nmhi1,$nmhi1 + add $ablo,$nmlo1,$nmlo1 + addc %r0,$nmhi1,$hi1 + fstws,mb ${fab0}L,-8($xfer) ; save high part + stw $nmlo1,-4($tp) ; tp[j-1] + + fcpy,sgl %fr0,${fti}L ; zero high part + fcpy,sgl %fr0,${fab0}L + add $hi1,$hi0,$hi0 + addc %r0,%r0,$hi1 + fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double + fcnvxf,dbl,dbl ${fab0},${fab0} + stw $hi0,0($tp) + stw $hi1,4($tp) + + fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] + fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int + xmpyu ${fn0},${fab0}R,${fm0} + ldo `$LOCALS+32+4`($fp),$tp +L\$outer_pa11 + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m + fstds ${fab0},-16($xfer) ; 33-bit value + fstds ${fnm0},-8($xfer) + flddx $idx($ap),${fai} ; ap[2,3] + flddx $idx($np),${fni} ; np[2,3] + ldw -16($xfer),$abhi ; carry bit actually + ldo 8($idx),$idx ; j++++ + ldw -12($xfer),$ablo + ldw -8($xfer),$nmhi0 + ldw -4($xfer),$nmlo0 + ldw 0($xfer),$hi0 ; high part + + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + fstds ${fab1},0($xfer) + addl $abhi,$hi0,$hi0 ; account carry bit + fstds ${fnm1},8($xfer) + add $ablo,$nmlo0,$nmlo0 ; discarded + ldw 0($tp),$ti1 ; tp[1] + addc %r0,$nmhi0,$hi1 + fstds ${fab0},-16($xfer) + fstds ${fnm0},-8($xfer) + ldw 4($xfer),$ablo + ldw 0($xfer),$abhi + +L\$inner_pa11 + xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] + flddx $idx($ap),${fai} ; ap[j,j+1] + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m + flddx $idx($np),${fni} ; np[j,j+1] + add $hi0,$ablo,$ablo + ldw 4($tp),$ti0 ; tp[j] + addc %r0,$abhi,$abhi + ldw 12($xfer),$nmlo1 + add $ti1,$ablo,$ablo + ldw 8($xfer),$nmhi1 + addc %r0,$abhi,$hi0 + fstds ${fab1},0($xfer) + add $ablo,$nmlo1,$nmlo1 + fstds ${fnm1},8($xfer) + addc %r0,$nmhi1,$nmhi1 + ldw -12($xfer),$ablo + add $hi1,$nmlo1,$nmlo1 + ldw -16($xfer),$abhi + addc %r0,$nmhi1,$hi1 + + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] + ldw 8($tp),$ti1 ; tp[j] + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + ldw -4($xfer),$nmlo0 + add $hi0,$ablo,$ablo + ldw -8($xfer),$nmhi0 + addc %r0,$abhi,$abhi + stw $nmlo1,-4($tp) ; tp[j-1] + add $ti0,$ablo,$ablo + fstds ${fab0},-16($xfer) + addc %r0,$abhi,$hi0 + fstds ${fnm0},-8($xfer) + add $ablo,$nmlo0,$nmlo0 + ldw 4($xfer),$ablo + addc %r0,$nmhi0,$nmhi0 + ldw 0($xfer),$abhi + add $hi1,$nmlo0,$nmlo0 + stws,ma $nmlo0,8($tp) ; tp[j-1] + addib,<> 8,$idx,L\$inner_pa11 ; j++++ + addc %r0,$nmhi0,$hi1 + + xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] + ldw 12($xfer),$nmlo1 + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m + ldw 8($xfer),$nmhi1 + add $hi0,$ablo,$ablo + ldw 4($tp),$ti0 ; tp[j] + addc %r0,$abhi,$abhi + fstds ${fab1},0($xfer) + add $ti1,$ablo,$ablo + fstds ${fnm1},8($xfer) + addc %r0,$abhi,$hi0 + ldw -16($xfer),$abhi + add $ablo,$nmlo1,$nmlo1 + ldw -12($xfer),$ablo + addc %r0,$nmhi1,$nmhi1 + ldw -8($xfer),$nmhi0 + add $hi1,$nmlo1,$nmlo1 + ldw -4($xfer),$nmlo0 + addc %r0,$nmhi1,$hi1 + + add $hi0,$ablo,$ablo + stw $nmlo1,-4($tp) ; tp[j-1] + addc %r0,$abhi,$abhi + add $ti0,$ablo,$ablo + ldw 8($tp),$ti1 ; tp[j] + addc %r0,$abhi,$hi0 + ldw 0($xfer),$abhi + add $ablo,$nmlo0,$nmlo0 + ldw 4($xfer),$ablo + addc %r0,$nmhi0,$nmhi0 + ldws,mb 8($xfer),$nmhi1 + add $hi1,$nmlo0,$nmlo0 + ldw 4($xfer),$nmlo1 + addc %r0,$nmhi0,$hi1 + stws,ma $nmlo0,8($tp) ; tp[j-1] + + addib,= -1,$num,L\$outerdone_pa11; i-- + subi 0,$arrsz,$idx ; j=0 + + fldws,ma 4($bp),${fbi} ; bp[i] + flddx $idx($ap),${fai} ; ap[0] + add $hi0,$ablo,$ablo + addc %r0,$abhi,$abhi + flddx $idx($np),${fni} ; np[0] + fldws 8($xfer),${fti}R ; tp[0] + add $ti1,$ablo,$ablo + addc %r0,$abhi,$hi0 + + ldo 8($idx),$idx ; j++++ + xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] + xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] + ldw 4($tp),$ti0 ; tp[j] + + add $hi1,$nmlo1,$nmlo1 + addc %r0,$nmhi1,$nmhi1 + fstws,mb ${fab0}L,-8($xfer) ; save high part + add $ablo,$nmlo1,$nmlo1 + addc %r0,$nmhi1,$hi1 + fcpy,sgl %fr0,${fti}L ; zero high part + fcpy,sgl %fr0,${fab0}L + stw $nmlo1,-4($tp) ; tp[j-1] + + fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double + fcnvxf,dbl,dbl ${fab0},${fab0} + add $hi1,$hi0,$hi0 + addc %r0,%r0,$hi1 + fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] + add $ti0,$hi0,$hi0 + addc %r0,$hi1,$hi1 + fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int + stw $hi0,0($tp) + stw $hi1,4($tp) + xmpyu ${fn0},${fab0}R,${fm0} + + b L\$outer_pa11 + ldo `$LOCALS+32+4`($fp),$tp + +L\$outerdone_pa11 + add $hi0,$ablo,$ablo + addc %r0,$abhi,$abhi + add $ti1,$ablo,$ablo + addc %r0,$abhi,$hi0 + + ldw 4($tp),$ti0 ; tp[j] + + add $hi1,$nmlo1,$nmlo1 + addc %r0,$nmhi1,$nmhi1 + add $ablo,$nmlo1,$nmlo1 + addc %r0,$nmhi1,$hi1 + stw $nmlo1,-4($tp) ; tp[j-1] + + add $hi1,$hi0,$hi0 + addc %r0,%r0,$hi1 + add $ti0,$hi0,$hi0 + addc %r0,$hi1,$hi1 + stw $hi0,0($tp) + stw $hi1,4($tp) + + ldo `$LOCALS+32+4`($fp),$tp + sub %r0,%r0,%r0 ; clear borrow + ldw -4($tp),$ti0 + addl $tp,$arrsz,$tp +L\$sub_pa11 + ldwx $idx($np),$hi0 + subb $ti0,$hi0,$hi1 + ldwx $idx($tp),$ti0 + addib,<> 4,$idx,L\$sub_pa11 + stws,ma $hi1,4($rp) + + subb $ti0,%r0,$hi1 + ldo -4($tp),$tp + and $tp,$hi1,$ap + andcm $rp,$hi1,$bp + or $ap,$bp,$np + + sub $rp,$arrsz,$rp ; rewind rp + subi 0,$arrsz,$idx + ldo `$LOCALS+32`($fp),$tp +L\$copy_pa11 + ldwx $idx($np),$hi0 + stws,ma %r0,4($tp) + addib,<> 4,$idx,L\$copy_pa11 + stws,ma $hi0,4($rp) + + nop ; alignment +L\$done +___ +} + +$code.=<<___; + ldi 1,%r28 ; signal "handled" + ldo $FRAME($fp),%sp ; destroy tp[num+1] + + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 +L\$abort + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by " +___ + +# Explicitly encode PA-RISC 2.0 instructions used in this module, so +# that it can be compiled with .LEVEL 1.0. It should be noted that I +# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 +# directive... + +my $ldd = sub { + my ($mod,$args) = @_; + my $orig = "ldd$mod\t$args"; + + if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4 + { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3; + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5 + { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3; + $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset + $opcode|=(1<<5) if ($mod =~ /^,m/); + $opcode|=(1<<13) if ($mod =~ /^,mb/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $std = sub { + my ($mod,$args) = @_; + my $orig = "std$mod\t$args"; + + if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6 + { my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6); + $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset + $opcode|=(1<<5) if ($mod =~ /^,m/); + $opcode|=(1<<13) if ($mod =~ /^,mb/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $extrd = sub { + my ($mod,$args) = @_; + my $orig = "extrd$mod\t$args"; + + # I only have ",u" completer, it's implicitly encoded... + if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 + { my $opcode=(0x36<<26)|($1<<21)|($4<<16); + my $len=32-$3; + $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos + $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 + { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); + my $len=32-$2; + $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len + $opcode |= (1<<13) if ($mod =~ /,\**=/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $shrpd = sub { + my ($mod,$args) = @_; + my $orig = "shrpd$mod\t$args"; + + if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 + { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; + my $cpos=63-$3; + $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $sub = sub { + my ($mod,$args) = @_; + my $orig = "sub$mod\t$args"; + + if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) { + my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3; + $opcode|=(1<<10); # e1 + $opcode|=(1<<8); # e2 + $opcode|=(1<<5); # d + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig + } + else { "\t".$orig; } +}; + +sub assemble { + my ($mnemonic,$mod,$args)=@_; + my $opcode = eval("\$$mnemonic"); + + ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + # flip word order in 64-bit mode... + s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8); + # assemble 2.0 instructions in 32-bit mode... + s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4); + + print $_,"\n"; +} +close STDOUT; diff --git a/crypto/pariscid.pl b/crypto/pariscid.pl new file mode 100644 index 0000000000..477ec9b87d --- /dev/null +++ b/crypto/pariscid.pl @@ -0,0 +1,224 @@ +#!/usr/bin/env perl + +$flavour = shift; +$output = shift; +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $ST ="std"; +} else { + $LEVEL ="1.1"; + $SIZE_T =4; + $ST ="stw"; +} + +$rp="%r2"; +$sp="%r30"; +$rv="%r28"; + +$code=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .EXPORT OPENSSL_cpuid_setup,ENTRY + .ALIGN 8 +OPENSSL_cpuid_setup + .PROC + .CALLINFO NO_CALLS + .ENTRY + bv ($rp) + .EXIT + nop + .PROCEND + + .EXPORT OPENSSL_rdtsc,ENTRY + .ALIGN 8 +OPENSSL_rdtsc + .PROC + .CALLINFO NO_CALLS + .ENTRY + mfctl %cr16,$rv + bv ($rp) + .EXIT + nop + .PROCEND + + .EXPORT OPENSSL_wipe_cpu,ENTRY + .ALIGN 8 +OPENSSL_wipe_cpu + .PROC + .CALLINFO NO_CALLS + .ENTRY + xor %r0,%r0,%r1 + fcpy,dbl %fr0,%fr4 + xor %r0,%r0,%r19 + fcpy,dbl %fr0,%fr5 + xor %r0,%r0,%r20 + fcpy,dbl %fr0,%fr6 + xor %r0,%r0,%r21 + fcpy,dbl %fr0,%fr7 + xor %r0,%r0,%r22 + fcpy,dbl %fr0,%fr8 + xor %r0,%r0,%r23 + fcpy,dbl %fr0,%fr9 + xor %r0,%r0,%r24 + fcpy,dbl %fr0,%fr10 + xor %r0,%r0,%r25 + fcpy,dbl %fr0,%fr11 + xor %r0,%r0,%r26 + fcpy,dbl %fr0,%fr22 + xor %r0,%r0,%r29 + fcpy,dbl %fr0,%fr23 + xor %r0,%r0,%r31 + fcpy,dbl %fr0,%fr24 + fcpy,dbl %fr0,%fr25 + fcpy,dbl %fr0,%fr26 + fcpy,dbl %fr0,%fr27 + fcpy,dbl %fr0,%fr28 + fcpy,dbl %fr0,%fr29 + fcpy,dbl %fr0,%fr30 + fcpy,dbl %fr0,%fr31 + bv ($rp) + .EXIT + ldo 0($sp),$rv + .PROCEND +___ +{ +my $inp="%r26"; +my $len="%r25"; + +$code.=<<___; + .EXPORT OPENSSL_cleanse,ENTRY,ARGW0=GR,ARGW1=GR + .ALIGN 8 +OPENSSL_cleanse + .PROC + .CALLINFO NO_CALLS + .ENTRY + cmpib,*= 0,$len,Ldone + nop + cmpib,*>>= 15,$len,Little + ldi $SIZE_T-1,%r1 + +Lalign + and,*<> $inp,%r1,%r28 + b,n Laligned + stb %r0,0($inp) + ldo -1($len),$len + b Lalign + ldo 1($inp),$inp + +Laligned + andcm $len,%r1,%r28 +Lot + $ST %r0,0($inp) + addib,*<> -$SIZE_T,%r28,Lot + ldo $SIZE_T($inp),$inp + + and,*<> $len,%r1,$len + b,n Ldone +Little + stb %r0,0($inp) + addib,*<> -1,$len,Little + ldo 1($inp),$inp +Ldone + bv ($rp) + .EXIT + nop + .PROCEND +___ +} +{ +my ($out,$cnt,$max)=("%r26","%r25","%r24"); +my ($tick,$lasttick)=("%r23","%r22"); +my ($diff,$lastdiff)=("%r21","%r20"); + +$code.=<<___; + .EXPORT OPENSSL_instrument_bus,ENTRY,ARGW0=GR,ARGW1=GR + .ALIGN 8 +OPENSSL_instrument_bus + .PROC + .CALLINFO NO_CALLS + .ENTRY + copy $cnt,$rv + mfctl %cr16,$tick + copy $tick,$lasttick + ldi 0,$diff + + fdc 0($out) + ldw 0($out),$tick + add $diff,$tick,$tick + stw $tick,0($out) +Loop + mfctl %cr16,$tick + sub $tick,$lasttick,$diff + copy $tick,$lasttick + + fdc 0($out) + ldw 0($out),$tick + add $diff,$tick,$tick + stw $tick,0($out) + + addib,<> -1,$cnt,Loop + addi 4,$out,$out + + bv ($rp) + .EXIT + sub $rv,$cnt,$rv + .PROCEND + + .EXPORT OPENSSL_instrument_bus2,ENTRY,ARGW0=GR,ARGW1=GR + .ALIGN 8 +OPENSSL_instrument_bus2 + .PROC + .CALLINFO NO_CALLS + .ENTRY + copy $cnt,$rv + sub %r0,$cnt,$cnt + + mfctl %cr16,$tick + copy $tick,$lasttick + ldi 0,$diff + + fdc 0($out) + ldw 0($out),$tick + add $diff,$tick,$tick + stw $tick,0($out) + + mfctl %cr16,$tick + sub $tick,$lasttick,$diff + copy $tick,$lasttick +Loop2 + copy $diff,$lastdiff + fdc 0($out) + ldw 0($out),$tick + add $diff,$tick,$tick + stw $tick,0($out) + + addib,= -1,$max,Ldone2 + nop + + mfctl %cr16,$tick + sub $tick,$lasttick,$diff + copy $tick,$lasttick + cmpclr,<> $lastdiff,$diff,$tick + ldi 1,$tick + + ldi 1,%r1 + xor %r1,$tick,$tick + addb,<> $tick,$cnt,Loop2 + shladd,l $tick,2,$out,$out +Ldone2 + bv ($rp) + .EXIT + add $rv,$cnt,$rv + .PROCEND +___ +} +$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4); +$code =~ s/,\*/,/gm if ($SIZE_T==4); +print $code; +close STDOUT; + diff --git a/crypto/rc4/asm/rc4-parisc.pl b/crypto/rc4/asm/rc4-parisc.pl new file mode 100644 index 0000000000..9e681965b9 --- /dev/null +++ b/crypto/rc4/asm/rc4-parisc.pl @@ -0,0 +1,313 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# RC4 for PA-RISC. + +# June 2009. +# +# Performance is 33% better than gcc 3.2 generated code on PA-7100LC. +# For reference, [4x] unrolled loop is >40% faster than folded one. +# It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement +# is believed to be not sufficient to justify the effort... +# +# Special thanks to polarhome.com for providing HP-UX account. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + +$flavour = shift; +$output = shift; +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $FRAME_MARKER =80; + $SAVED_RP =16; + $PUSH ="std"; + $PUSHMA ="std,ma"; + $POP ="ldd"; + $POPMB ="ldd,mb"; +} else { + $LEVEL ="1.0"; + $SIZE_T =4; + $FRAME_MARKER =48; + $SAVED_RP =20; + $PUSH ="stw"; + $PUSHMA ="stwm"; + $POP ="ldw"; + $POPMB ="ldwm"; +} + +$FRAME=4*$SIZE_T+$FRAME_MARKER; # 4 saved regs + frame marker + # [+ argument transfer] +$SZ=1; # defaults to RC4_CHAR +if (open CONF,"<${dir}../../opensslconf.h") { + while() { + if (m/#\s*define\s+RC4_INT\s+(.*)/) { + $SZ = ($1=~/char$/) ? 1 : 4; + last; + } + } + close CONF; +} + +if ($SZ==1) { # RC4_CHAR + $LD="ldb"; + $LDX="ldbx"; + $MKX="addl"; + $ST="stb"; +} else { # RC4_INT (~5% faster than RC4_CHAR on PA-7100LC) + $LD="ldw"; + $LDX="ldwx,s"; + $MKX="sh2addl"; + $ST="stw"; +} + +$key="%r26"; +$len="%r25"; +$inp="%r24"; +$out="%r23"; + +@XX=("%r19","%r20"); +@TX=("%r21","%r22"); +$YY="%r28"; +$TY="%r29"; + +$acc="%r1"; +$ix="%r2"; +$iy="%r3"; +$dat0="%r4"; +$dat1="%r5"; +$rem="%r6"; +$mask="%r31"; + +sub unrolledloopbody { +for ($i=0;$i<4;$i++) { +$code.=<<___; + ldo 1($XX[0]),$XX[1] + `sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)` + and $mask,$XX[1],$XX[1] + $LDX $YY($key),$TY + $MKX $YY,$key,$ix + $LDX $XX[1]($key),$TX[1] + $MKX $XX[0],$key,$iy + $ST $TX[0],0($ix) + comclr,<> $XX[1],$YY,%r0 ; conditional + copy $TX[0],$TX[1] ; move + `sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)` + $ST $TY,0($iy) + addl $TX[0],$TY,$TY + addl $TX[1],$YY,$YY + and $mask,$TY,$TY + and $mask,$YY,$YY +___ +push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers +} } + +sub foldedloop { +my ($label,$count)=@_; +$code.=<<___; +$label + $MKX $YY,$key,$iy + $LDX $YY($key),$TY + $MKX $XX[0],$key,$ix + $ST $TX[0],0($iy) + ldo 1($XX[0]),$XX[0] + $ST $TY,0($ix) + addl $TX[0],$TY,$TY + ldbx $inp($out),$dat1 + and $mask,$TY,$TY + and $mask,$XX[0],$XX[0] + $LDX $TY($key),$acc + $LDX $XX[0]($key),$TX[0] + ldo 1($out),$out + xor $dat1,$acc,$acc + addl $TX[0],$YY,$YY + stb $acc,-1($out) + addib,<> -1,$count,$label ; $count is always small + and $mask,$YY,$YY +___ +} + +$code=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR +RC4 + .PROC + .CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) + + cmpib,*= 0,$len,L\$abort + sub $inp,$out,$inp ; distance between $inp and $out + + $LD `0*$SZ`($key),$XX[0] + $LD `1*$SZ`($key),$YY + ldo `2*$SZ`($key),$key + + ldi 0xff,$mask + ldi 3,$dat0 + + ldo 1($XX[0]),$XX[0] ; warm up loop + and $mask,$XX[0],$XX[0] + $LDX $XX[0]($key),$TX[0] + addl $TX[0],$YY,$YY + cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother? + and $mask,$YY,$YY + + and,<> $out,$dat0,$rem ; is $out aligned? + b L\$alignedout + subi 4,$rem,$rem + sub $len,$rem,$len +___ +&foldedloop("L\$alignout",$rem); # process till $out is aligned + +$code.=<<___; +L\$alignedout ; $len is at least 4 here + and,<> $inp,$dat0,$acc ; is $inp aligned? + b L\$oop4 + sub $inp,$acc,$rem ; align $inp + + sh3addl $acc,%r0,$acc + subi 32,$acc,$acc + mtctl $acc,%cr11 ; load %sar with vshd align factor + ldwx $rem($out),$dat0 + ldo 4($rem),$rem +L\$oop4misalignedinp +___ +&unrolledloopbody(); +$code.=<<___; + $LDX $TY($key),$ix + ldwx $rem($out),$dat1 + ldo -4($len),$len + or $ix,$acc,$acc ; last piece, no need to dep + vshd $dat0,$dat1,$iy ; align data + copy $dat1,$dat0 + xor $iy,$acc,$acc + stw $acc,0($out) + cmpib,*<< 3,$len,L\$oop4misalignedinp + ldo 4($out),$out + cmpib,*= 0,$len,L\$done + nop + b L\$oop1 + nop + + .ALIGN 8 +L\$oop4 +___ +&unrolledloopbody(); +$code.=<<___; + $LDX $TY($key),$ix + ldwx $inp($out),$dat0 + ldo -4($len),$len + or $ix,$acc,$acc ; last piece, no need to dep + xor $dat0,$acc,$acc + stw $acc,0($out) + cmpib,*<< 3,$len,L\$oop4 + ldo 4($out),$out + cmpib,*= 0,$len,L\$done + nop +___ +&foldedloop("L\$oop1",$len); +$code.=<<___; +L\$done + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 + ldo -1($XX[0]),$XX[0] ; chill out loop + sub $YY,$TX[0],$YY + and $mask,$XX[0],$XX[0] + and $mask,$YY,$YY + $ST $XX[0],`-2*$SZ`($key) + $ST $YY,`-1*$SZ`($key) + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 +L\$abort + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND +___ + +$code.=<<___; + + .EXPORT RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR + .ALIGN 8 +RC4_set_key + .PROC + .CALLINFO NO_CALLS + .ENTRY + $ST %r0,`0*$SZ`($key) + $ST %r0,`1*$SZ`($key) + ldo `2*$SZ`($key),$key + copy %r0,@XX[0] +L\$1st + $ST @XX[0],0($key) + ldo 1(@XX[0]),@XX[0] + bb,>= @XX[0],`31-8`,L\$1st ; @XX[0]<256 + ldo $SZ($key),$key + + ldo `-256*$SZ`($key),$key ; rewind $key + addl $len,$inp,$inp ; $inp to point at the end + sub %r0,$len,%r23 ; inverse index + copy %r0,@XX[0] + copy %r0,@XX[1] + ldi 0xff,$mask + +L\$2nd + $LDX @XX[0]($key),@TX[0] + ldbx %r23($inp),@TX[1] + addi,nuv 1,%r23,%r23 ; increment and conditional + sub %r0,$len,%r23 ; inverse index + addl @TX[0],@XX[1],@XX[1] + addl @TX[1],@XX[1],@XX[1] + and $mask,@XX[1],@XX[1] + $MKX @XX[0],$key,$TY + $LDX @XX[1]($key),@TX[1] + $MKX @XX[1],$key,$YY + ldo 1(@XX[0]),@XX[0] + $ST @TX[0],0($YY) + bb,>= @XX[0],`31-8`,L\$2nd ; @XX[0]<256 + $ST @TX[1],0($TY) + + bv,n (%r2) + .EXIT + nop + .PROCEND + + .EXPORT RC4_options,ENTRY + .ALIGN 8 +RC4_options + .PROC + .CALLINFO NO_CALLS + .ENTRY + blr %r0,%r28 + ldi 3,%r1 +L\$pic + andcm %r28,%r1,%r28 + bv (%r2) + .EXIT + ldo L\$opts-L\$pic(%r28),%r28 + .PROCEND + .ALIGN 8 +L\$opts + .STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)" + .STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by " +___ +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4); + +print $code; +close STDOUT; diff --git a/crypto/sha/asm/sha1-parisc.pl b/crypto/sha/asm/sha1-parisc.pl new file mode 100644 index 0000000000..6d7bf495b2 --- /dev/null +++ b/crypto/sha/asm/sha1-parisc.pl @@ -0,0 +1,259 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA1 block procedure for PA-RISC. + +# June 2009. +# +# On PA-7100LC performance is >30% better than gcc 3.2 generated code +# for aligned input and >50% better for unaligned. Compared to vendor +# compiler on PA-8600 it's almost 60% faster in 64-bit build and just +# few percent faster in 32-bit one (this for aligned input, data for +# unaligned input is not available). +# +# Special thanks to polarhome.com for providing HP-UX account. + +$flavour = shift; +$output = shift; +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $FRAME_MARKER =80; + $SAVED_RP =16; + $PUSH ="std"; + $PUSHMA ="std,ma"; + $POP ="ldd"; + $POPMB ="ldd,mb"; +} else { + $LEVEL ="1.0"; + $SIZE_T =4; + $FRAME_MARKER =48; + $SAVED_RP =20; + $PUSH ="stw"; + $PUSHMA ="stwm"; + $POP ="ldw"; + $POPMB ="ldwm"; +} + +$FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker + # [+ argument transfer] +$ctx="%r26"; # arg0 +$inp="%r25"; # arg1 +$num="%r24"; # arg2 + +$t0="%r28"; +$t1="%r29"; +$K="%r31"; + +@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", + "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0); + +@V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23"); + +sub BODY_00_19 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i<15); + addl $K,$e,$e ; $i + shd $a,$a,27,$t1 + addl @X[$i],$e,$e + and $c,$b,$t0 + addl $t1,$e,$e + andcm $d,$b,$t1 + shd $b,$b,2,$b + or $t1,$t0,$t0 + addl $t0,$e,$e +___ +$code.=<<___ if ($i>=15); # with forward Xupdate + addl $K,$e,$e ; $i + shd $a,$a,27,$t1 + xor @X[($j+2)%16],@X[$j%16],@X[$j%16] + addl @X[$i%16],$e,$e + and $c,$b,$t0 + xor @X[($j+8)%16],@X[$j%16],@X[$j%16] + addl $t1,$e,$e + andcm $d,$b,$t1 + shd $b,$b,2,$b + or $t1,$t0,$t0 + xor @X[($j+13)%16],@X[$j%16],@X[$j%16] + add $t0,$e,$e + shd @X[$j%16],@X[$j%16],31,@X[$j%16] +___ +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i<79); + xor @X[($j+2)%16],@X[$j%16],@X[$j%16] ; $i + addl $K,$e,$e + shd $a,$a,27,$t1 + xor @X[($j+8)%16],@X[$j%16],@X[$j%16] + addl @X[$i%16],$e,$e + xor $b,$c,$t0 + xor @X[($j+13)%16],@X[$j%16],@X[$j%16] + addl $t1,$e,$e + shd $b,$b,2,$b + xor $d,$t0,$t0 + shd @X[$j%16],@X[$j%16],31,@X[$j%16] + addl $t0,$e,$e +___ +$code.=<<___ if ($i==79); # with context load + ldw 0($ctx),@X[0] ; $i + addl $K,$e,$e + shd $a,$a,27,$t1 + ldw 4($ctx),@X[1] + addl @X[$i%16],$e,$e + xor $b,$c,$t0 + ldw 8($ctx),@X[2] + addl $t1,$e,$e + shd $b,$b,2,$b + xor $d,$t0,$t0 + ldw 12($ctx),@X[3] + addl $t0,$e,$e + ldw 16($ctx),@X[4] +___ +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___; + shd $a,$a,27,$t1 ; $i + addl $K,$e,$e + xor @X[($j+2)%16],@X[$j%16],@X[$j%16] + xor $d,$c,$t0 + addl @X[$i%16],$e,$e + xor @X[($j+8)%16],@X[$j%16],@X[$j%16] + and $b,$t0,$t0 + addl $t1,$e,$e + shd $b,$b,2,$b + xor @X[($j+13)%16],@X[$j%16],@X[$j%16] + addl $t0,$e,$e + and $d,$c,$t1 + shd @X[$j%16],@X[$j%16],31,@X[$j%16] + addl $t1,$e,$e +___ +} + +$code=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .EXPORT sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR +sha1_block_data_order + .PROC + .CALLINFO FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) + $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) + $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) + $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) + $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) + $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) + + ldw 0($ctx),$A + ldw 4($ctx),$B + ldw 8($ctx),$C + ldw 12($ctx),$D + ldw 16($ctx),$E + + extru $inp,31,2,$t0 ; t0=inp&3; + sh3addl $t0,%r0,$t0 ; t0*=8; + subi 32,$t0,$t0 ; t0=32-t0; + mtctl $t0,%cr11 ; %sar=t0; + +L\$oop + ldi 3,$t0 + andcm $inp,$t0,$t0 ; 64-bit neutral +___ + for ($i=0;$i<15;$i++) { # load input block + $code.="\tldw `4*$i`($t0),@X[$i]\n"; } +$code.=<<___; + cmpb,*= $inp,$t0,L\$aligned + ldw 60($t0),@X[15] + ldw 64($t0),@X[16] +___ + for ($i=0;$i<16;$i++) { # align input + $code.="\tvshd @X[$i],@X[$i+1],@X[$i]\n"; } +$code.=<<___; +L\$aligned + ldil L'0x5a827000,$K ; K_00_19 + ldo 0x999($K),$K +___ +for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + ldil L'0x6ed9e000,$K ; K_20_39 + ldo 0xba1($K),$K +___ + +for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + ldil L'0x8f1bb000,$K ; K_40_59 + ldo 0xcdc($K),$K +___ + +for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + ldil L'0xca62c000,$K ; K_60_79 + ldo 0x1d6($K),$K +___ +for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } + +$code.=<<___; + addl @X[0],$A,$A + addl @X[1],$B,$B + addl @X[2],$C,$C + addl @X[3],$D,$D + addl @X[4],$E,$E + stw $A,0($ctx) + stw $B,4($ctx) + stw $C,8($ctx) + stw $D,12($ctx) + stw $E,16($ctx) + addib,*<> -1,$num,L\$oop + ldo 64($inp),$inp + + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 + $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 + $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 + $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 + $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 + $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 + $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + .STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by " +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/,\*/,/gm if ($SIZE_T==4); +print $code; +close STDOUT; diff --git a/crypto/sha/asm/sha512-parisc.pl b/crypto/sha/asm/sha512-parisc.pl new file mode 100755 index 0000000000..e24ee58ae9 --- /dev/null +++ b/crypto/sha/asm/sha512-parisc.pl @@ -0,0 +1,791 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA256/512 block procedure for PA-RISC. + +# June 2009. +# +# SHA256 performance is >75% better than gcc 3.2 generated code on +# PA-7100LC. Compared to code generated by vendor compiler this +# implementation is almost 70% faster in 64-bit build, but delivers +# virtually same performance in 32-bit build on PA-8600. +# +# SHA512 performance is >2.9x better than gcc 3.2 generated code on +# PA-7100LC, PA-RISC 1.1 processor. Then implementation detects if the +# code is executed on PA-RISC 2.0 processor and switches to 64-bit +# code path delivering adequate peformance even in "blended" 32-bit +# build. Though 64-bit code is not any faster than code generated by +# vendor compiler on PA-8600... +# +# Special thanks to polarhome.com for providing HP-UX account. + +$flavour = shift; +$output = shift; +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $FRAME_MARKER =80; + $SAVED_RP =16; + $PUSH ="std"; + $PUSHMA ="std,ma"; + $POP ="ldd"; + $POPMB ="ldd,mb"; +} else { + $LEVEL ="1.0"; + $SIZE_T =4; + $FRAME_MARKER =48; + $SAVED_RP =20; + $PUSH ="stw"; + $PUSHMA ="stwm"; + $POP ="ldw"; + $POPMB ="ldwm"; +} + +if ($output =~ /512/) { + $func="sha512_block_data_order"; + $SZ=8; + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=(1, 8, 7); + @sigma1=(19,61, 6); + $rounds=80; + $LAST10BITS=0x017; + $LD="ldd"; + $LDM="ldd,ma"; + $ST="std"; +} else { + $func="sha256_block_data_order"; + $SZ=4; + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 7,18, 3); + @sigma1=(17,19,10); + $rounds=64; + $LAST10BITS=0x0f2; + $LD="ldw"; + $LDM="ldwm"; + $ST="stw"; +} + +$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker + # [+ argument transfer] +$XOFF=16*$SZ+32; # local variables +$FRAME+=$XOFF; +$XOFF+=$FRAME_MARKER; # distance between %sp and local variables + +$ctx="%r26"; # zapped by $a0 +$inp="%r25"; # zapped by $a1 +$num="%r24"; # zapped by $t0 + +$a0 ="%r26"; +$a1 ="%r25"; +$t0 ="%r24"; +$t1 ="%r29"; +$Tbl="%r31"; + +@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28"); + +@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", + "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp); + +sub ROUND_00_15 { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +$code.=<<___; + _ror $e,$Sigma1[0],$a0 + and $f,$e,$t0 + _ror $e,$Sigma1[1],$a1 + addl $t1,$h,$h + andcm $g,$e,$t1 + xor $a1,$a0,$a0 + _ror $a1,`$Sigma1[2]-$Sigma1[1]`,$a1 + or $t0,$t1,$t1 ; Ch(e,f,g) + addl @X[$i%16],$h,$h + xor $a0,$a1,$a1 ; Sigma1(e) + addl $t1,$h,$h + _ror $a,$Sigma0[0],$a0 + addl $a1,$h,$h + + _ror $a,$Sigma0[1],$a1 + and $a,$b,$t0 + and $a,$c,$t1 + xor $a1,$a0,$a0 + _ror $a1,`$Sigma0[2]-$Sigma0[1]`,$a1 + xor $t1,$t0,$t0 + and $b,$c,$t1 + xor $a0,$a1,$a1 ; Sigma0(a) + addl $h,$d,$d + xor $t1,$t0,$t0 ; Maj(a,b,c) + `"$LDM $SZ($Tbl),$t1" if ($i<15)` + addl $a1,$h,$h + addl $t0,$h,$h + +___ +} + +sub ROUND_16_xx { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +$i-=16; +$code.=<<___; + _ror @X[($i+1)%16],$sigma0[0],$a0 + _ror @X[($i+1)%16],$sigma0[1],$a1 + addl @X[($i+9)%16],@X[$i],@X[$i] + _ror @X[($i+14)%16],$sigma1[0],$t0 + _ror @X[($i+14)%16],$sigma1[1],$t1 + xor $a1,$a0,$a0 + _shr @X[($i+1)%16],$sigma0[2],$a1 + xor $t1,$t0,$t0 + _shr @X[($i+14)%16],$sigma1[2],$t1 + xor $a1,$a0,$a0 ; sigma0(X[(i+1)&0x0f]) + xor $t1,$t0,$t0 ; sigma1(X[(i+14)&0x0f]) + $LDM $SZ($Tbl),$t1 + addl $a0,@X[$i],@X[$i] + addl $t0,@X[$i],@X[$i] +___ +$code.=<<___ if ($i==15); + extru $t1,31,10,$a1 + comiclr,<> $LAST10BITS,$a1,%r0 + ldo 1($Tbl),$Tbl ; signal end of $Tbl +___ +&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h); +} + +$code=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .ALIGN 64 +L\$table +___ +$code.=<<___ if ($SZ==8); + .WORD 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd + .WORD 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc + .WORD 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019 + .WORD 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118 + .WORD 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe + .WORD 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2 + .WORD 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1 + .WORD 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694 + .WORD 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3 + .WORD 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65 + .WORD 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483 + .WORD 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5 + .WORD 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210 + .WORD 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4 + .WORD 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725 + .WORD 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70 + .WORD 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926 + .WORD 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df + .WORD 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8 + .WORD 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b + .WORD 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001 + .WORD 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30 + .WORD 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910 + .WORD 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8 + .WORD 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53 + .WORD 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8 + .WORD 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb + .WORD 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3 + .WORD 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60 + .WORD 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec + .WORD 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9 + .WORD 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b + .WORD 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207 + .WORD 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178 + .WORD 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6 + .WORD 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b + .WORD 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493 + .WORD 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c + .WORD 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a + .WORD 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817 +___ +$code.=<<___ if ($SZ==4); + .WORD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .WORD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .WORD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .WORD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .WORD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .WORD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .WORD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .WORD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .WORD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .WORD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .WORD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .WORD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .WORD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .WORD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .WORD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .WORD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +___ +$code.=<<___; + + .EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR + .ALIGN 64 +$func + .PROC + .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) + $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) + $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) + $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) + $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) + $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) + $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp) + $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp) + + _shl $num,`log(16*$SZ)/log(2)`,$num + addl $inp,$num,$num ; $num to point at the end of $inp + + $PUSH $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp) ; save arguments + $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) + $PUSH $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp) + + blr %r0,$Tbl + ldi 3,$t1 +L\$pic + andcm $Tbl,$t1,$Tbl ; wipe privilege level + ldo L\$table-L\$pic($Tbl),$Tbl +___ +$code.=<<___ if ($SZ==8 && $SIZE_T==4); + ldi 31,$t1 + mtctl $t1,%cr11 + extrd,u,*= $t1,%sar,1,$t1 ; executes on PA-RISC 1.0 + b L\$parisc1 + nop +___ +$code.=<<___; + $LD `0*$SZ`($ctx),$A ; load context + $LD `1*$SZ`($ctx),$B + $LD `2*$SZ`($ctx),$C + $LD `3*$SZ`($ctx),$D + $LD `4*$SZ`($ctx),$E + $LD `5*$SZ`($ctx),$F + $LD `6*$SZ`($ctx),$G + $LD `7*$SZ`($ctx),$H + + extru $inp,31,`log($SZ)/log(2)`,$t0 + sh3addl $t0,%r0,$t0 + subi `8*$SZ`,$t0,$t0 + mtctl $t0,%cr11 ; load %sar with align factor + +L\$oop + ldi `$SZ-1`,$t0 + $LDM $SZ($Tbl),$t1 + andcm $inp,$t0,$t0 ; align $inp +___ + for ($i=0;$i<15;$i++) { # load input block + $code.="\t$LD `$SZ*$i`($t0),@X[$i]\n"; } +$code.=<<___; + cmpb,*= $inp,$t0,L\$aligned + $LD `$SZ*15`($t0),@X[15] + $LD `$SZ*16`($t0),@X[16] +___ + for ($i=0;$i<16;$i++) { # align data + $code.="\t_align @X[$i],@X[$i+1],@X[$i]\n"; } +$code.=<<___; +L\$aligned + nop ; otherwise /usr/ccs/bin/as is confused by below .WORD +___ + +for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; +L\$rounds + nop ; otherwise /usr/ccs/bin/as is confused by below .WORD +___ +for(;$i<32;$i++) { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + bb,>= $Tbl,31,L\$rounds ; end of $Tbl signalled? + nop + + $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments + $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp + $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num + ldo `-$rounds*$SZ-1`($Tbl),$Tbl ; rewind $Tbl + + $LD `0*$SZ`($ctx),@X[0] ; load context + $LD `1*$SZ`($ctx),@X[1] + $LD `2*$SZ`($ctx),@X[2] + $LD `3*$SZ`($ctx),@X[3] + $LD `4*$SZ`($ctx),@X[4] + $LD `5*$SZ`($ctx),@X[5] + addl @X[0],$A,$A + $LD `6*$SZ`($ctx),@X[6] + addl @X[1],$B,$B + $LD `7*$SZ`($ctx),@X[7] + ldo `16*$SZ`($inp),$inp ; advance $inp + + $ST $A,`0*$SZ`($ctx) ; save context + addl @X[2],$C,$C + $ST $B,`1*$SZ`($ctx) + addl @X[3],$D,$D + $ST $C,`2*$SZ`($ctx) + addl @X[4],$E,$E + $ST $D,`3*$SZ`($ctx) + addl @X[5],$F,$F + $ST $E,`4*$SZ`($ctx) + addl @X[6],$G,$G + $ST $F,`5*$SZ`($ctx) + addl @X[7],$H,$H + $ST $G,`6*$SZ`($ctx) + $ST $H,`7*$SZ`($ctx) + + cmpb,*<>,n $inp,$num,L\$oop + $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp +___ +if ($SZ==8 && $SIZE_T==4) # SHA512 for 32-bit PA-RISC 1.0 +{{ +$code.=<<___; + b L\$done + nop + + .ALIGN 64 +L\$parisc1 +___ + +@V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo, + $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) = + ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", + "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16"); +$a0 ="%r17"; +$a1 ="%r18"; +$a2 ="%r19"; +$a3 ="%r20"; +$t0 ="%r21"; +$t1 ="%r22"; +$t2 ="%r28"; +$t3 ="%r29"; +$Tbl="%r31"; + +@X=("%r23","%r24","%r25","%r26"); # zaps $num,$inp,$ctx + +sub ROUND_00_15_pa1 { +my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo, + $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_; +my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X; + +$code.=<<___ if (!$flag); + ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi + ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1] +___ +$code.=<<___; + shd $ehi,$elo,$Sigma1[0],$t0 + add $Xlo,$hlo,$hlo + shd $elo,$ehi,$Sigma1[0],$t1 + addc $Xhi,$hhi,$hhi ; h += X[i] + shd $ehi,$elo,$Sigma1[1],$t2 + ldwm 8($Tbl),$Xhi + shd $elo,$ehi,$Sigma1[1],$t3 + ldw -4($Tbl),$Xlo ; load K[i] + xor $t2,$t0,$t0 + xor $t3,$t1,$t1 + and $flo,$elo,$a0 + and $fhi,$ehi,$a1 + shd $ehi,$elo,$Sigma1[2],$t2 + andcm $glo,$elo,$a2 + shd $elo,$ehi,$Sigma1[2],$t3 + andcm $ghi,$ehi,$a3 + xor $t2,$t0,$t0 + xor $t3,$t1,$t1 ; Sigma1(e) + add $Xlo,$hlo,$hlo + xor $a2,$a0,$a0 + addc $Xhi,$hhi,$hhi ; h += K[i] + xor $a3,$a1,$a1 ; Ch(e,f,g) + + add $t0,$hlo,$hlo + shd $ahi,$alo,$Sigma0[0],$t0 + addc $t1,$hhi,$hhi ; h += Sigma1(e) + shd $alo,$ahi,$Sigma0[0],$t1 + add $a0,$hlo,$hlo + shd $ahi,$alo,$Sigma0[1],$t2 + addc $a1,$hhi,$hhi ; h += Ch(e,f,g) + shd $alo,$ahi,$Sigma0[1],$t3 + + xor $t2,$t0,$t0 + xor $t3,$t1,$t1 + shd $ahi,$alo,$Sigma0[2],$t2 + and $alo,$blo,$a0 + shd $alo,$ahi,$Sigma0[2],$t3 + and $ahi,$bhi,$a1 + xor $t2,$t0,$t0 + xor $t3,$t1,$t1 ; Sigma0(a) + + and $alo,$clo,$a2 + and $ahi,$chi,$a3 + xor $a2,$a0,$a0 + add $hlo,$dlo,$dlo + xor $a3,$a1,$a1 + addc $hhi,$dhi,$dhi ; d += h + and $blo,$clo,$a2 + add $t0,$hlo,$hlo + and $bhi,$chi,$a3 + addc $t1,$hhi,$hhi ; h += Sigma0(a) + xor $a2,$a0,$a0 + add $a0,$hlo,$hlo + xor $a3,$a1,$a1 ; Maj(a,b,c) + addc $a1,$hhi,$hhi ; h += Maj(a,b,c) + +___ +$code.=<<___ if ($i==15 && $flag); + extru $Xlo,31,10,$Xlo + comiclr,= $LAST10BITS,$Xlo,%r0 + b L\$rounds_pa1 + nop +___ +push(@X,shift(@X)); push(@X,shift(@X)); +} + +sub ROUND_16_xx_pa1 { +my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X; +my ($i)=shift; +$i-=16; +$code.=<<___; + ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi + ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1] + ldw `-$XOFF+8*(($i+9)%16)`(%sp),$a1 + ldw `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0 ; load X[i+9] + ldw `-$XOFF+8*(($i+14)%16)`(%sp),$a3 + ldw `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2 ; load X[i+14] + shd $Xnhi,$Xnlo,$sigma0[0],$t0 + shd $Xnlo,$Xnhi,$sigma0[0],$t1 + add $a0,$Xlo,$Xlo + shd $Xnhi,$Xnlo,$sigma0[1],$t2 + addc $a1,$Xhi,$Xhi + shd $Xnlo,$Xnhi,$sigma0[1],$t3 + xor $t2,$t0,$t0 + shd $Xnhi,$Xnlo,$sigma0[2],$t2 + xor $t3,$t1,$t1 + extru $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3 + xor $t2,$t0,$t0 + shd $a3,$a2,$sigma1[0],$a0 + xor $t3,$t1,$t1 ; sigma0(X[i+1)&0x0f]) + shd $a2,$a3,$sigma1[0],$a1 + add $t0,$Xlo,$Xlo + shd $a3,$a2,$sigma1[1],$t2 + addc $t1,$Xhi,$Xhi + shd $a2,$a3,$sigma1[1],$t3 + xor $t2,$a0,$a0 + shd $a3,$a2,$sigma1[2],$t2 + xor $t3,$a1,$a1 + extru $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3 + xor $t2,$a0,$a0 + xor $t3,$a1,$a1 ; sigma0(X[i+14)&0x0f]) + add $a0,$Xlo,$Xlo + addc $a1,$Xhi,$Xhi + + stw $Xhi,`-$XOFF+8*($i%16)`(%sp) + stw $Xlo,`-$XOFF+8*($i%16)+4`(%sp) +___ +&ROUND_00_15_pa1($i,@_,1); +} +$code.=<<___; + ldw `0*4`($ctx),$Ahi ; load context + ldw `1*4`($ctx),$Alo + ldw `2*4`($ctx),$Bhi + ldw `3*4`($ctx),$Blo + ldw `4*4`($ctx),$Chi + ldw `5*4`($ctx),$Clo + ldw `6*4`($ctx),$Dhi + ldw `7*4`($ctx),$Dlo + ldw `8*4`($ctx),$Ehi + ldw `9*4`($ctx),$Elo + ldw `10*4`($ctx),$Fhi + ldw `11*4`($ctx),$Flo + ldw `12*4`($ctx),$Ghi + ldw `13*4`($ctx),$Glo + ldw `14*4`($ctx),$Hhi + ldw `15*4`($ctx),$Hlo + + extru $inp,31,2,$t0 + sh3addl $t0,%r0,$t0 + subi 32,$t0,$t0 + mtctl $t0,%cr11 ; load %sar with align factor + +L\$oop_pa1 + extru $inp,31,2,$a3 + comib,= 0,$a3,L\$aligned_pa1 + sub $inp,$a3,$inp + + ldw `0*4`($inp),$X[0] + ldw `1*4`($inp),$X[1] + ldw `2*4`($inp),$t2 + ldw `3*4`($inp),$t3 + ldw `4*4`($inp),$a0 + ldw `5*4`($inp),$a1 + ldw `6*4`($inp),$a2 + ldw `7*4`($inp),$a3 + vshd $X[0],$X[1],$X[0] + vshd $X[1],$t2,$X[1] + stw $X[0],`-$XOFF+0*4`(%sp) + ldw `8*4`($inp),$t0 + vshd $t2,$t3,$t2 + stw $X[1],`-$XOFF+1*4`(%sp) + ldw `9*4`($inp),$t1 + vshd $t3,$a0,$t3 +___ +{ +my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1); +for ($i=2;$i<=(128/4-8);$i++) { +$code.=<<___; + stw $t[0],`-$XOFF+$i*4`(%sp) + ldw `(8+$i)*4`($inp),$t[0] + vshd $t[1],$t[2],$t[1] +___ +push(@t,shift(@t)); +} +for (;$i<(128/4-1);$i++) { +$code.=<<___; + stw $t[0],`-$XOFF+$i*4`(%sp) + vshd $t[1],$t[2],$t[1] +___ +push(@t,shift(@t)); +} +$code.=<<___; + b L\$collected_pa1 + stw $t[0],`-$XOFF+$i*4`(%sp) + +___ +} +$code.=<<___; +L\$aligned_pa1 + ldw `0*4`($inp),$X[0] + ldw `1*4`($inp),$X[1] + ldw `2*4`($inp),$t2 + ldw `3*4`($inp),$t3 + ldw `4*4`($inp),$a0 + ldw `5*4`($inp),$a1 + ldw `6*4`($inp),$a2 + ldw `7*4`($inp),$a3 + stw $X[0],`-$XOFF+0*4`(%sp) + ldw `8*4`($inp),$t0 + stw $X[1],`-$XOFF+1*4`(%sp) + ldw `9*4`($inp),$t1 +___ +{ +my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1); +for ($i=2;$i<(128/4-8);$i++) { +$code.=<<___; + stw $t[0],`-$XOFF+$i*4`(%sp) + ldw `(8+$i)*4`($inp),$t[0] +___ +push(@t,shift(@t)); +} +for (;$i<128/4;$i++) { +$code.=<<___; + stw $t[0],`-$XOFF+$i*4`(%sp) +___ +push(@t,shift(@t)); +} +$code.="L\$collected_pa1\n"; +} + +for($i=0;$i<16;$i++) { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); } +$code.="L\$rounds_pa1\n"; +for(;$i<32;$i++) { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); } + +$code.=<<___; + $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments + $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp + $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num + ldo `-$rounds*$SZ`($Tbl),$Tbl ; rewind $Tbl + + ldw `0*4`($ctx),$t1 ; update context + ldw `1*4`($ctx),$t0 + ldw `2*4`($ctx),$t3 + ldw `3*4`($ctx),$t2 + ldw `4*4`($ctx),$a1 + ldw `5*4`($ctx),$a0 + ldw `6*4`($ctx),$a3 + add $t0,$Alo,$Alo + ldw `7*4`($ctx),$a2 + addc $t1,$Ahi,$Ahi + ldw `8*4`($ctx),$t1 + add $t2,$Blo,$Blo + ldw `9*4`($ctx),$t0 + addc $t3,$Bhi,$Bhi + ldw `10*4`($ctx),$t3 + add $a0,$Clo,$Clo + ldw `11*4`($ctx),$t2 + addc $a1,$Chi,$Chi + ldw `12*4`($ctx),$a1 + add $a2,$Dlo,$Dlo + ldw `13*4`($ctx),$a0 + addc $a3,$Dhi,$Dhi + ldw `14*4`($ctx),$a3 + add $t0,$Elo,$Elo + ldw `15*4`($ctx),$a2 + addc $t1,$Ehi,$Ehi + stw $Ahi,`0*4`($ctx) + add $t2,$Flo,$Flo + stw $Alo,`1*4`($ctx) + addc $t3,$Fhi,$Fhi + stw $Bhi,`2*4`($ctx) + add $a0,$Glo,$Glo + stw $Blo,`3*4`($ctx) + addc $a1,$Ghi,$Ghi + stw $Chi,`4*4`($ctx) + add $a2,$Hlo,$Hlo + stw $Clo,`5*4`($ctx) + addc $a3,$Hhi,$Hhi + stw $Dhi,`6*4`($ctx) + ldo `16*$SZ`($inp),$inp ; advance $inp + stw $Dlo,`7*4`($ctx) + stw $Ehi,`8*4`($ctx) + stw $Elo,`9*4`($ctx) + stw $Fhi,`10*4`($ctx) + stw $Flo,`11*4`($ctx) + stw $Ghi,`12*4`($ctx) + stw $Glo,`13*4`($ctx) + stw $Hhi,`14*4`($ctx) + comb,= $inp,$num,L\$done + stw $Hlo,`15*4`($ctx) + b L\$oop_pa1 + $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp +L\$done +___ +}} +$code.=<<___; + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 + $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 + $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 + $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 + $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 + $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 + $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 + $POP `-$FRAME+14*$SIZE_T`(%sp),%r17 + $POP `-$FRAME+15*$SIZE_T`(%sp),%r18 + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + .STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by " +___ + +# Explicitly encode PA-RISC 2.0 instructions used in this module, so +# that it can be compiled with .LEVEL 1.0. It should be noted that I +# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 +# directive... + +my $ldd = sub { + my ($mod,$args) = @_; + my $orig = "ldd$mod\t$args"; + + if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices + { my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1); + $opcode|=(1<<3) if ($mod =~ /^,m/); + $opcode|=(1<<2) if ($mod =~ /^,mb/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $std = sub { + my ($mod,$args) = @_; + my $orig = "std$mod\t$args"; + + if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices + { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $extrd = sub { + my ($mod,$args) = @_; + my $orig = "extrd$mod\t$args"; + + # I only have ",u" completer, it's implicitly encoded... + if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 + { my $opcode=(0x36<<26)|($1<<21)|($4<<16); + my $len=32-$3; + $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos + $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 + { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); + my $len=32-$2; + $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len + $opcode |= (1<<13) if ($mod =~ /,\**=/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $shrpd = sub { + my ($mod,$args) = @_; + my $orig = "shrpd$mod\t$args"; + + if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 + { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; + my $cpos=63-$3; + $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11 + { sprintf "\t.WORD\t0x%08x\t; %s", + (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig; + } + else { "\t".$orig; } +}; + +sub assemble { + my ($mnemonic,$mod,$args)=@_; + my $opcode = eval("\$$mnemonic"); + + ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/ + $3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32 + : sprintf("shd\t%$1,%$2,%d",$3)/e or + # translate made up instructons: _ror, _shr, _align, _shl + s/_ror(\s+)(%r[0-9]+),/ + ($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or + + s/_shr(\s+%r[0-9]+),([0-9]+),/ + $SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2) + : sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e or + + s/_align(\s+%r[0-9]+,%r[0-9]+),/ + ($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e or + + s/_shl(\s+%r[0-9]+),([0-9]+),/ + $SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2) + : sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e; + + s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4); + + s/cmpb,\*/comb,/ if ($SIZE_T==4); + + print $_,"\n"; +} + +close STDOUT; -- 2.34.1