From: Andy Polyakov
Date: Tue, 10 Nov 2015 20:11:24 +0000 (+0100)
Subject: bn/asm/s390x.S: improve performance on z196 and z13 by up to 26%. [even z10 is couple...
X-Git-Tag: OpenSSL_1_0_2e~28
X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=commitdiff_plain;h=c6a5fddf6c92099cc174ee48f763e56ef9fe8609;ds=sidebyside

bn/asm/s390x.S: improve performance on z196 and z13 by up to 26%.
[even z10 is couple percent faster]. Triggered by RT#4128, but solves
the problem by real modulo-scheduling.

Reviewed-by: Rich Salz
(cherry picked from commit 9d0e4dc6351df7d0c08400c4b4cf17c017022e50)
---

diff --git a/crypto/bn/asm/s390x.S b/crypto/bn/asm/s390x.S
index 43fcb79bc0..f5eebe413a 100755
--- a/crypto/bn/asm/s390x.S
+++ b/crypto/bn/asm/s390x.S
@@ -18,71 +18,106 @@
 .align	4
 bn_mul_add_words:
 	lghi	zero,0		// zero = 0
-	la	%r1,0(%r2)	// put rp aside
-	lghi	%r2,0		// i=0;
+	la	%r1,0(%r2)	// put rp aside [to give way to]
+	lghi	%r2,0		// return value
 	ltgfr	%r4,%r4
 	bler	%r14		// if (len<=0) return 0;
 
-	stmg	%r6,%r10,48(%r15)
-	lghi	%r10,3
-	lghi	%r8,0		// carry = 0
-	nr	%r10,%r4	// len%4
+	stmg	%r6,%r13,48(%r15)
+	lghi	%r2,3
+	lghi	%r12,0		// carry = 0
+	slgr	%r1,%r3		// rp-=ap
+	nr	%r2,%r4		// len%4
 	sra	%r4,2		// cnt=len/4
 	jz	.Loop1_madd	// carry is incidentally cleared if branch taken
 	algr	zero,zero	// clear carry
 
-.Loop4_madd:
-	lg	%r7,0(%r2,%r3)	// ap[i]
+	lg	%r7,0(%r3)	// ap[0]
+	lg	%r9,8(%r3)	// ap[1]
 	mlgr	%r6,%r5		// *=w
-	alcgr	%r7,%r8		// +=carry
-	alcgr	%r6,zero
-	alg	%r7,0(%r2,%r1)	// +=rp[i]
-	stg	%r7,0(%r2,%r1)	// rp[i]=
+	brct	%r4,.Loop4_madd
+	j	.Loop4_madd_tail
 
-	lg	%r9,8(%r2,%r3)
+.Loop4_madd:
 	mlgr	%r8,%r5
+	lg	%r11,16(%r3)	// ap[i+2]
+	alcgr	%r7,%r12	// +=carry
+	alcgr	%r6,zero
+	alg	%r7,0(%r3,%r1)	// +=rp[i]
+	stg	%r7,0(%r3,%r1)	// rp[i]=
+
+	mlgr	%r10,%r5
+	lg	%r13,24(%r3)
 	alcgr	%r9,%r6
 	alcgr	%r8,zero
-	alg	%r9,8(%r2,%r1)
-	stg	%r9,8(%r2,%r1)
+	alg	%r9,8(%r3,%r1)
+	stg	%r9,8(%r3,%r1)
+
+	mlgr	%r12,%r5
+	lg	%r7,32(%r3)
+	alcgr	%r11,%r8
+	alcgr	%r10,zero
+	alg	%r11,16(%r3,%r1)
+	stg	%r11,16(%r3,%r1)
 
-	lg	%r7,16(%r2,%r3)
 	mlgr	%r6,%r5
-	alcgr	%r7,%r8
-	alcgr	%r6,zero
-	alg	%r7,16(%r2,%r1)
-	stg	%r7,16(%r2,%r1)
+	lg	%r9,40(%r3)
+	alcgr	%r13,%r10
+	alcgr	%r12,zero
+	alg	%r13,24(%r3,%r1)
+	stg	%r13,24(%r3,%r1)
 
-	lg	%r9,24(%r2,%r3)
+	la	%r3,32(%r3)	// i+=4
+	brct	%r4,.Loop4_madd
+
+.Loop4_madd_tail:
 	mlgr	%r8,%r5
+	lg	%r11,16(%r3)
+	alcgr	%r7,%r12	// +=carry
+	alcgr	%r6,zero
+	alg	%r7,0(%r3,%r1)	// +=rp[i]
+	stg	%r7,0(%r3,%r1)	// rp[i]=
+
+	mlgr	%r10,%r5
+	lg	%r13,24(%r3)
 	alcgr	%r9,%r6
 	alcgr	%r8,zero
-	alg	%r9,24(%r2,%r1)
-	stg	%r9,24(%r2,%r1)
+	alg	%r9,8(%r3,%r1)
+	stg	%r9,8(%r3,%r1)
 
-	la	%r2,32(%r2)	// i+=4
-	brct	%r4,.Loop4_madd
+	mlgr	%r12,%r5
+	alcgr	%r11,%r8
+	alcgr	%r10,zero
+	alg	%r11,16(%r3,%r1)
+	stg	%r11,16(%r3,%r1)
 
-	la	%r10,1(%r10)	// see if len%4 is zero ...
-	brct	%r10,.Loop1_madd	// without touching condition code:-)
+	alcgr	%r13,%r10
+	alcgr	%r12,zero
+	alg	%r13,24(%r3,%r1)
+	stg	%r13,24(%r3,%r1)
+
+	la	%r3,32(%r3)	// i+=4
+
+	la	%r2,1(%r2)	// see if len%4 is zero ...
+	brct	%r2,.Loop1_madd	// without touching condition code:-)
 
 .Lend_madd:
-	alcgr	%r8,zero	// collect carry bit
-	lgr	%r2,%r8
-	lmg	%r6,%r10,48(%r15)
+	lgr	%r2,zero	// return value
+	alcgr	%r2,%r12	// collect even carry bit
+	lmg	%r6,%r13,48(%r15)
 	br	%r14
 
 .Loop1_madd:
-	lg	%r7,0(%r2,%r3)	// ap[i]
+	lg	%r7,0(%r3)	// ap[i]
 	mlgr	%r6,%r5		// *=w
-	alcgr	%r7,%r8		// +=carry
+	alcgr	%r7,%r12	// +=carry
 	alcgr	%r6,zero
-	alg	%r7,0(%r2,%r1)	// +=rp[i]
-	stg	%r7,0(%r2,%r1)	// rp[i]=
+	alg	%r7,0(%r3,%r1)	// +=rp[i]
+	stg	%r7,0(%r3,%r1)	// rp[i]=
 
-	lgr	%r8,%r6
-	la	%r2,8(%r2)	// i++
-	brct	%r10,.Loop1_madd
+	lgr	%r12,%r6
+	la	%r3,8(%r3)	// i++
+	brct	%r2,.Loop1_madd
 
 	j	.Lend_madd
 .size	bn_mul_add_words,.-bn_mul_add_words