From: Andy Polyakov Date: Mon, 25 Jan 2016 22:41:01 +0000 (+0100) Subject: crypto/bn/x86_64-mont5.pl: constant-time gather procedure. X-Git-Tag: OpenSSL_1_1_0-pre4~429 X-Git-Url: https://git.openssl.org/?p=openssl.git;a=commitdiff_plain;h=8fc8f486f7fa098c9fbb6a6ae399e3c6856e0d87 crypto/bn/x86_64-mont5.pl: constant-time gather procedure. At the same time remove miniscule bias in final subtraction. Performance penalty varies from platform to platform, and even with key length. For rsa2048 sign it was observed to be 4% for Sandy Bridge and 7% on Broadwell. CVE-2016-0702 Reviewed-by: Richard Levitte Reviewed-by: Rich Salz --- diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl index e82e451388..69232cbeb6 100755 --- a/crypto/bn/asm/x86_64-mont.pl +++ b/crypto/bn/asm/x86_64-mont.pl @@ -775,20 +775,20 @@ bn_sqr8x_mont: # 4096. this is done to allow memory disambiguation logic # do its job. # - lea -64(%rsp,$num,4),%r11 + lea -64(%rsp,$num,2),%r11 mov ($n0),$n0 # *n0 sub $aptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lsqr8x_sp_alt sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num) + lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) jmp .Lsqr8x_sp_done .align 32 .Lsqr8x_sp_alt: - lea 4096-64(,$num,4),%r10 # 4096-frame-4*$num - lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num) + lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num + lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 @@ -798,37 +798,17 @@ bn_sqr8x_mont: mov $num,%r10 neg $num - lea 64(%rsp,$num,2),%r11 # copy of modulus mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp .Lsqr8x_body: - mov $num,$i - movq %r11, %xmm2 # save pointer to modulus copy - shr \$3+2,$i - mov OPENSSL_ia32cap_P+8(%rip),%eax - jmp .Lsqr8x_copy_n - -.align 32 -.Lsqr8x_copy_n: - movq 8*0($nptr),%xmm0 - movq 8*1($nptr),%xmm1 - movq 8*2($nptr),%xmm3 - movq 8*3($nptr),%xmm4 - lea 8*4($nptr),$nptr - movdqa %xmm0,16*0(%r11) - movdqa %xmm1,16*1(%r11) - movdqa %xmm3,16*2(%r11) - movdqa %xmm4,16*3(%r11) - lea 16*4(%r11),%r11 - dec $i - jnz .Lsqr8x_copy_n - + movq $nptr, %xmm2 # save pointer to modulus pxor %xmm0,%xmm0 movq $rptr,%xmm1 # save $rptr movq %r10, %xmm3 # -$num ___ $code.=<<___ if ($addx); + mov OPENSSL_ia32cap_P+8(%rip),%eax and \$0x80100,%eax cmp \$0x80100,%eax jne .Lsqr8x_nox @@ -837,7 +817,6 @@ $code.=<<___ if ($addx); pxor %xmm0,%xmm0 lea 48(%rsp),%rax - lea 64(%rsp,$num,2),%rdx shr \$3+2,$num mov 40(%rsp),%rsi # restore %rsp jmp .Lsqr8x_zero @@ -850,7 +829,6 @@ $code.=<<___; pxor %xmm0,%xmm0 lea 48(%rsp),%rax - lea 64(%rsp,$num,2),%rdx shr \$3+2,$num mov 40(%rsp),%rsi # restore %rsp jmp .Lsqr8x_zero @@ -862,11 +840,6 @@ $code.=<<___; movdqa %xmm0,16*2(%rax) movdqa %xmm0,16*3(%rax) lea 16*4(%rax),%rax - movdqa %xmm0,16*0(%rdx) # wipe n - movdqa %xmm0,16*1(%rdx) - movdqa %xmm0,16*2(%rdx) - movdqa %xmm0,16*3(%rdx) - lea 16*4(%rdx),%rdx dec $num jnz .Lsqr8x_zero diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl index 292409c4ff..be91ef09d5 100755 --- a/crypto/bn/asm/x86_64-mont5.pl +++ b/crypto/bn/asm/x86_64-mont5.pl @@ -99,7 +99,8 @@ $code.=<<___; .Lmul_enter: mov ${num}d,${num}d mov %rsp,%rax - mov `($win64?56:8)`(%rsp),%r10d # load 7th argument + movd `($win64?56:8)`(%rsp),%xmm0 # load 7th argument + lea .Lmagic_masks(%rip),%r10 push %rbx push %rbp push %r12 @@ -108,9 +109,10 @@ $code.=<<___; push %r15 ___ $code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp + lea -0x38(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) ___ $code.=<<___; lea 2($num),%r11 @@ -120,36 +122,69 @@ $code.=<<___; mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp .Lmul_body: - mov $bp,%r12 # reassign $bp + lea 128($bp),%r12 # reassign $bp (+size optimization) ___ $bp="%r12"; $STRIDE=2**5*8; # 5 is "window size" $N=$STRIDE/4; # should match cache line size $code.=<<___; - mov %r10,%r11 - shr \$`log($N/8)/log(2)`,%r10 - and \$`$N/8-1`,%r11 - not %r10 - lea .Lmagic_masks(%rip),%rax - and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" - lea 96($bp,%r11,8),$bp # pointer within 1st cache line - movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which - movq 8(%rax,%r10,8),%xmm5 # cache line contains element - movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument - movq 24(%rax,%r10,8),%xmm7 - - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 + ################################################################ + # calculate mask: one of %xmm4..7 will contain 0xff..00 or + # 0x00..ff denoting which half of a quarter of corresponding + # cache line is significant. + # + movq 56(%r10),%xmm1 # 0b11001 + movq %xmm0,%rdx + pand %xmm1,%xmm0 + movdqa 0(%r10),%xmm4 + pshufd \$0,%xmm0,%xmm0 # broadcast masked index + movdqa 16(%r10),%xmm5 + movdqa 32(%r10),%xmm6 + pcmpeqd %xmm0,%xmm4 + movdqa 48(%r10),%xmm7 + pcmpeqd %xmm0,%xmm5 + pcmpeqd %xmm0,%xmm6 + pcmpeqd %xmm0,%xmm7 + + ################################################################ + # calculate index in 1st cache line, but in such manner that + # if target data is in another cache line, then relevant + # "rotating" reference would land on it... + # + shr \$1,%rdx # idx/=2 + mov %rdx,$j + shr \$2,%rdx + sub %rdx,$j + and \$3,$j # (idx-idx/4)%4 + shl \$4,$j # scale for xmm references + + ################################################################ + # "rotating" references are touching different cache banks in + # different cache lines, so that not only all cache lines are + # referred in each iteration, but even all cache banks. + # + lea 16($j),$m0 + lea 32($j),$m1 + and \$63,$m0 + lea 48($j),%rdx + and \$63,$m1 + and \$63,%rdx + movdqa `0*$STRIDE/4-128`($bp,$j),%xmm0 + movdqa `1*$STRIDE/4-128`($bp,$m0),%xmm1 + movdqa `2*$STRIDE/4-128`($bp,$m1),%xmm2 + movdqa `3*$STRIDE/4-128`($bp,%rdx),%xmm3 pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bp),%xmm3 pand %xmm6,%xmm2 por %xmm1,%xmm0 pand %xmm7,%xmm3 por %xmm2,%xmm0 lea $STRIDE($bp),$bp por %xmm3,%xmm0 + movq $j,%xmm8 + + pshufd \$0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 # merge upper and lower halves movq %xmm0,$m0 # m0=bp[0] @@ -159,29 +194,14 @@ $code.=<<___; xor $i,$i # i=0 xor $j,$j # j=0 - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - mov $n0,$m1 mulq $m0 # ap[0]*bp[0] mov %rax,$lo0 mov ($np),%rax - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq $lo0,$m1 # "tp[0]"*n0 mov %rdx,$hi0 - por %xmm2,%xmm0 - lea $STRIDE($bp),$bp - por %xmm3,%xmm0 - mulq $m1 # np[0]*m1 add %rax,$lo0 # discarded mov 8($ap),%rax @@ -214,14 +234,13 @@ $code.=<<___; cmp $num,$j jne .L1st - movq %xmm0,$m0 # bp[1] + movq %xmm8,$j add %rax,$hi1 - mov ($ap),%rax # ap[0] adc \$0,%rdx add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx - mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov $hi1,-16(%rsp,$num,8) # tp[j-1] mov %rdx,$hi1 mov $lo0,$hi0 @@ -235,33 +254,43 @@ $code.=<<___; jmp .Louter .align 16 .Louter: + lea 16($j),$m0 + lea 32($j),$m1 + and \$63,$m0 + lea 48($j),%rdx + and \$63,$m1 + and \$63,%rdx + movdqa `0*$STRIDE/4-128`($bp,$j),%xmm0 + movdqa `1*$STRIDE/4-128`($bp,$m0),%xmm1 + movdqa `2*$STRIDE/4-128`($bp,$m1),%xmm2 + movdqa `3*$STRIDE/4-128`($bp,%rdx),%xmm3 + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + por %xmm2,%xmm0 + lea $STRIDE($bp),$bp + por %xmm3,%xmm0 + + pshufd \$0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 # merge upper and lower halves + + mov ($ap),%rax # ap[0] + movq %xmm0,$m0 # m0=bp[i] + xor $j,$j # j=0 mov $n0,$m1 mov (%rsp),$lo0 - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - mulq $m0 # ap[0]*bp[i] add %rax,$lo0 # ap[0]*bp[i]+tp[0] mov ($np),%rax adc \$0,%rdx - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq $lo0,$m1 # tp[0]*n0 mov %rdx,$hi0 - por %xmm2,%xmm0 - lea $STRIDE($bp),$bp - por %xmm3,%xmm0 - mulq $m1 # np[0]*m1 add %rax,$lo0 # discarded mov 8($ap),%rax @@ -297,15 +326,14 @@ $code.=<<___; cmp $num,$j jne .Linner - movq %xmm0,$m0 # bp[i+1] + movq %xmm8,$j add %rax,$hi1 - mov ($ap),%rax # ap[0] adc \$0,%rdx add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] - mov (%rsp,$j,8),$lo0 + mov (%rsp,$num,8),$lo0 adc \$0,%rdx - mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov $hi1,-16(%rsp,$num,8) # tp[j-1] mov %rdx,$hi1 xor %rdx,%rdx @@ -354,8 +382,9 @@ $code.=<<___; mov \$1,%rax ___ $code.=<<___ if ($win64); - movaps -88(%rsi),%xmm6 - movaps -72(%rsi),%xmm7 + movaps -104(%rsi),%xmm6 + movaps -88(%rsi),%xmm7 + movaps -72(%rsi),%xmm8 ___ $code.=<<___; mov -48(%rsi),%r15 @@ -379,8 +408,8 @@ bn_mul4x_mont_gather5: .Lmul4x_enter: ___ $code.=<<___ if ($addx); - and \$0x80100,%r11d - cmp \$0x80100,%r11d + and \$0x80108,%r11d + cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 je .Lmulx4x_enter ___ $code.=<<___; @@ -392,39 +421,34 @@ $code.=<<___; push %r13 push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; + .byte 0x67 - mov ${num}d,%r10d - shl \$3,${num}d - shl \$3+2,%r10d # 4*$num + shl \$3,${num}d # convert $num to bytes + lea ($num,$num,2),%r10 # 3*$num in bytes neg $num # -$num ############################################################## - # ensure that stack frame doesn't alias with $aptr+4*$num - # modulo 4096, which covers ret[num], am[num] and n[2*num] - # (see bn_exp.c). this is done to allow memory disambiguation - # logic do its magic. [excessive frame is allocated in order - # to allow bn_from_mont8x to clear it.] + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra [num] is allocated in order + # to align with bn_power5's frame, which is cleansed after + # completing exponentiation. Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] # - lea -64(%rsp,$num,2),%r11 - sub $ap,%r11 + lea -320(%rsp,$num,2),%r11 + sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmul4xsp_alt - sub %r11,%rsp # align with $ap - lea -64(%rsp,$num,2),%rsp # alloca(128+num*8) + sub %r11,%rsp # align with $rp + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) jmp .Lmul4xsp_done .align 32 .Lmul4xsp_alt: - lea 4096-64(,$num,2),%r10 - lea -64(%rsp,$num,2),%rsp # alloca(128+num*8) + lea 4096-320(,$num,2),%r10 + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 @@ -440,12 +464,7 @@ $code.=<<___; mov 40(%rsp),%rsi # restore %rsp mov \$1,%rax -___ -$code.=<<___ if ($win64); - movaps -88(%rsi),%xmm6 - movaps -72(%rsi),%xmm7 -___ -$code.=<<___; + mov -48(%rsi),%r15 mov -40(%rsi),%r14 mov -32(%rsi),%r13 @@ -460,9 +479,10 @@ $code.=<<___; .type mul4x_internal,\@abi-omnipotent .align 32 mul4x_internal: - shl \$5,$num - mov `($win64?56:8)`(%rax),%r10d # load 7th argument - lea 256(%rdx,$num),%r13 + shl \$5,$num # $num was in bytes + movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index + lea .Linc(%rip),%rax + lea 128(%rdx,$num),%r13 # end of powers table (+size optimization) shr \$5,$num # restore $num ___ $bp="%r12"; @@ -470,44 +490,92 @@ ___ $N=$STRIDE/4; # should match cache line size $tp=$i; $code.=<<___; - mov %r10,%r11 - shr \$`log($N/8)/log(2)`,%r10 - and \$`$N/8-1`,%r11 - not %r10 - lea .Lmagic_masks(%rip),%rax - and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" - lea 96(%rdx,%r11,8),$bp # pointer within 1st cache line - movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which - movq 8(%rax,%r10,8),%xmm5 # cache line contains element - add \$7,%r11 - movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument - movq 24(%rax,%r10,8),%xmm7 - and \$7,%r11 - - movq `0*$STRIDE/4-96`($bp),%xmm0 - lea $STRIDE($bp),$tp # borrow $tp - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - .byte 0x67 - por %xmm1,%xmm0 - movq `0*$STRIDE/4-96`($tp),%xmm1 - .byte 0x67 - pand %xmm7,%xmm3 - .byte 0x67 - por %xmm2,%xmm0 - movq `1*$STRIDE/4-96`($tp),%xmm2 + movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 + lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization) + lea 128(%rdx),$bp # size optimization + + pshufd \$0,%xmm5,%xmm5 # broadcast index + movdqa %xmm1,%xmm4 + .byte 0x67,0x67 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# calculate mask by comparing 0..31 to index and save result to stack +# +$code.=<<___; + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 .byte 0x67 - pand %xmm4,%xmm1 + movdqa %xmm4,%xmm3 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($i+0)+112`(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($i+1)+112`(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($i+2)+112`(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,`16*($i+3)+112`(%r10) + movdqa %xmm4,%xmm3 +___ +} +$code.=<<___; # last iteration can be optimized + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,`16*($i+0)+112`(%r10) + + paddd %xmm2,%xmm3 .byte 0x67 - por %xmm3,%xmm0 - movq `2*$STRIDE/4-96`($tp),%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,`16*($i+1)+112`(%r10) + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,`16*($i+2)+112`(%r10) + pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register + + pand `16*($i+1)-128`($bp),%xmm1 + pand `16*($i+2)-128`($bp),%xmm2 + movdqa %xmm3,`16*($i+3)+112`(%r10) + pand `16*($i+3)-128`($bp),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bp),%xmm4 + movdqa `16*($i+1)-128`($bp),%xmm5 + movdqa `16*($i+2)-128`($bp),%xmm2 + pand `16*($i+0)+112`(%r10),%xmm4 + movdqa `16*($i+3)-128`($bp),%xmm3 + pand `16*($i+1)+112`(%r10),%xmm5 + por %xmm4,%xmm0 + pand `16*($i+2)+112`(%r10),%xmm2 + por %xmm5,%xmm1 + pand `16*($i+3)+112`(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +} +$code.=<<___; + por %xmm1,%xmm0 + pshufd \$0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + lea $STRIDE($bp),$bp movq %xmm0,$m0 # m0=bp[0] - movq `3*$STRIDE/4-96`($tp),%xmm0 + mov %r13,16+8(%rsp) # save end of b[num] mov $rp, 56+8(%rsp) # save $rp @@ -521,26 +589,10 @@ $code.=<<___; mov %rax,$A[0] mov ($np),%rax - pand %xmm5,%xmm2 - pand %xmm6,%xmm3 - por %xmm2,%xmm1 - imulq $A[0],$m1 # "tp[0]"*n0 - ############################################################## - # $tp is chosen so that writing to top-most element of the - # vector occurs just "above" references to powers table, - # "above" modulo cache-line size, which effectively precludes - # possibility of memory disambiguation logic failure when - # accessing the table. - # - lea 64+8(%rsp,%r11,8),$tp + lea 64+8(%rsp),$tp mov %rdx,$A[1] - pand %xmm7,%xmm0 - por %xmm3,%xmm1 - lea 2*$STRIDE($bp),$bp - por %xmm1,%xmm0 - mulq $m1 # np[0]*m1 add %rax,$A[0] # discarded mov 8($ap,$num),%rax @@ -549,7 +601,7 @@ $code.=<<___; mulq $m0 add %rax,$A[1] - mov 16*1($np),%rax # interleaved with 0, therefore 16*n + mov 8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] @@ -559,7 +611,7 @@ $code.=<<___; adc \$0,%rdx add $A[1],$N[1] lea 4*8($num),$j # j=4 - lea 16*4($np),$np + lea 8*4($np),$np adc \$0,%rdx mov $N[1],($tp) mov %rdx,$N[0] @@ -569,7 +621,7 @@ $code.=<<___; .L1st4x: mulq $m0 # ap[j]*bp[0] add %rax,$A[0] - mov -16*2($np),%rax + mov -8*2($np),%rax lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] @@ -585,7 +637,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[0] add %rax,$A[1] - mov -16*1($np),%rax + mov -8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] @@ -600,7 +652,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[0] add %rax,$A[0] - mov 16*0($np),%rax + mov 8*0($np),%rax adc \$0,%rdx mov %rdx,$A[1] @@ -615,7 +667,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[0] add %rax,$A[1] - mov 16*1($np),%rax + mov 8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] @@ -624,7 +676,7 @@ $code.=<<___; mov 16($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] - lea 16*4($np),$np + lea 8*4($np),$np adc \$0,%rdx mov $N[1],($tp) # tp[j-1] mov %rdx,$N[0] @@ -634,7 +686,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[0] add %rax,$A[0] - mov -16*2($np),%rax + mov -8*2($np),%rax lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] @@ -650,7 +702,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[0] add %rax,$A[1] - mov -16*1($np),%rax + mov -8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] @@ -663,8 +715,7 @@ $code.=<<___; mov $N[1],-16($tp) # tp[j-1] mov %rdx,$N[0] - movq %xmm0,$m0 # bp[1] - lea ($np,$num,2),$np # rewind $np + lea ($np,$num),$np # rewind $np xor $N[1],$N[1] add $A[0],$N[0] @@ -675,6 +726,33 @@ $code.=<<___; .align 32 .Louter4x: + lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization) + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +for($i=0;$i<$STRIDE/16;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bp),%xmm0 + movdqa `16*($i+1)-128`($bp),%xmm1 + movdqa `16*($i+2)-128`($bp),%xmm2 + movdqa `16*($i+3)-128`($bp),%xmm3 + pand `16*($i+0)-128`(%rdx),%xmm0 + pand `16*($i+1)-128`(%rdx),%xmm1 + por %xmm0,%xmm4 + pand `16*($i+2)-128`(%rdx),%xmm2 + por %xmm1,%xmm5 + pand `16*($i+3)-128`(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + pshufd \$0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + lea $STRIDE($bp),$bp + movq %xmm0,$m0 # m0=bp[i] + mov ($tp,$num),$A[0] mov $n0,$m1 mulq $m0 # ap[0]*bp[i] @@ -682,25 +760,11 @@ $code.=<<___; mov ($np),%rax adc \$0,%rdx - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bp),%xmm3 - imulq $A[0],$m1 # tp[0]*n0 - .byte 0x67 mov %rdx,$A[1] mov $N[1],($tp) # store upmost overflow bit - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - por %xmm2,%xmm0 lea ($tp,$num),$tp # rewind $tp - lea $STRIDE($bp),$bp - por %xmm3,%xmm0 mulq $m1 # np[0]*m1 add %rax,$A[0] # "$N[0]", discarded @@ -710,7 +774,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[i] add %rax,$A[1] - mov 16*1($np),%rax # interleaved with 0, therefore 16*n + mov 8*1($np),%rax adc \$0,%rdx add 8($tp),$A[1] # +tp[1] adc \$0,%rdx @@ -722,7 +786,7 @@ $code.=<<___; adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] lea 4*8($num),$j # j=4 - lea 16*4($np),$np + lea 8*4($np),$np adc \$0,%rdx mov %rdx,$N[0] jmp .Linner4x @@ -731,7 +795,7 @@ $code.=<<___; .Linner4x: mulq $m0 # ap[j]*bp[i] add %rax,$A[0] - mov -16*2($np),%rax + mov -8*2($np),%rax adc \$0,%rdx add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] lea 32($tp),$tp @@ -749,7 +813,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[i] add %rax,$A[1] - mov -16*1($np),%rax + mov -8*1($np),%rax adc \$0,%rdx add -8($tp),$A[1] adc \$0,%rdx @@ -766,7 +830,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[i] add %rax,$A[0] - mov 16*0($np),%rax + mov 8*0($np),%rax adc \$0,%rdx add ($tp),$A[0] # ap[j]*bp[i]+tp[j] adc \$0,%rdx @@ -783,7 +847,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[i] add %rax,$A[1] - mov 16*1($np),%rax + mov 8*1($np),%rax adc \$0,%rdx add 8($tp),$A[1] adc \$0,%rdx @@ -794,7 +858,7 @@ $code.=<<___; mov 16($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] - lea 16*4($np),$np + lea 8*4($np),$np adc \$0,%rdx mov $N[0],-8($tp) # tp[j-1] mov %rdx,$N[0] @@ -804,7 +868,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[i] add %rax,$A[0] - mov -16*2($np),%rax + mov -8*2($np),%rax adc \$0,%rdx add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] lea 32($tp),$tp @@ -823,7 +887,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov $m1,%rax - mov -16*1($np),$m1 + mov -8*1($np),$m1 adc \$0,%rdx add -8($tp),$A[1] adc \$0,%rdx @@ -838,9 +902,8 @@ $code.=<<___; mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[0] - movq %xmm0,$m0 # bp[i+1] mov $N[1],-16($tp) # tp[j-1] - lea ($np,$num,2),$np # rewind $np + lea ($np,$num),$np # rewind $np xor $N[1],$N[1] add $A[0],$N[0] @@ -854,16 +917,23 @@ $code.=<<___; ___ if (1) { $code.=<<___; + xor %rax,%rax sub $N[0],$m1 # compare top-most words adc $j,$j # $j is zero or $j,$N[1] - xor \$1,$N[1] + sub $N[1],%rax # %rax=-$N[1] lea ($tp,$num),%rbx # tptr in .sqr4x_sub - lea ($np,$N[1],8),%rbp # nptr in .sqr4x_sub + mov ($np),%r12 + lea ($np),%rbp # nptr in .sqr4x_sub mov %r9,%rcx - sar \$3+2,%rcx # cf=0 + sar \$3+2,%rcx mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub - jmp .Lsqr4x_sub + dec %r12 # so that after 'not' we get -n[0] + xor %r10,%r10 + mov 8*1(%rbp),%r13 + mov 8*2(%rbp),%r14 + mov 8*3(%rbp),%r15 + jmp .Lsqr4x_sub_entry ___ } else { my @ri=("%rax",$bp,$m0,$m1); @@ -930,8 +1000,8 @@ bn_power5: ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%r11d - and \$0x80100,%r11d - cmp \$0x80100,%r11d + and \$0x80108,%r11d + cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 je .Lpowerx5_enter ___ $code.=<<___; @@ -942,38 +1012,32 @@ $code.=<<___; push %r13 push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; - mov ${num}d,%r10d + shl \$3,${num}d # convert $num to bytes - shl \$3+2,%r10d # 4*$num + lea ($num,$num,2),%r10d # 3*$num neg $num mov ($n0),$n0 # *n0 ############################################################## - # ensure that stack frame doesn't alias with $aptr+4*$num - # modulo 4096, which covers ret[num], am[num] and n[2*num] - # (see bn_exp.c). this is done to allow memory disambiguation - # logic do its magic. + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] # - lea -64(%rsp,$num,2),%r11 - sub $aptr,%r11 + lea -320(%rsp,$num,2),%r11 + sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwr_sp_alt sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) jmp .Lpwr_sp_done .align 32 .Lpwr_sp_alt: - lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea 4096-320(,$num,2),%r10 + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 @@ -995,9 +1059,9 @@ $code.=<<___; mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp .Lpower5_body: - movq $rptr,%xmm1 # save $rptr + movq $rptr,%xmm1 # save $rptr, used in sqr8x movq $nptr,%xmm2 # save $nptr - movq %r10, %xmm3 # -$num + movq %r10, %xmm3 # -$num, used in sqr8x movq $bptr,%xmm4 call __bn_sqr8x_internal @@ -1567,7 +1631,7 @@ $code.=<<___; movq %xmm2,$nptr sqr8x_reduction: xor %rax,%rax - lea ($nptr,$num,2),%rcx # end of n[] + lea ($nptr,$num),%rcx # end of n[] lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer mov %rcx,0+8(%rsp) lea 48+8(%rsp,$num),$tptr # end of initial t[] window @@ -1593,21 +1657,21 @@ sqr8x_reduction: .byte 0x67 mov $m0,%r8 imulq 32+8(%rsp),$m0 # n0*a[0] - mov 16*0($nptr),%rax # n[0] + mov 8*0($nptr),%rax # n[0] mov \$8,%ecx jmp .L8x_reduce .align 32 .L8x_reduce: mulq $m0 - mov 16*1($nptr),%rax # n[1] + mov 8*1($nptr),%rax # n[1] neg %r8 mov %rdx,%r8 adc \$0,%r8 mulq $m0 add %rax,%r9 - mov 16*2($nptr),%rax + mov 8*2($nptr),%rax adc \$0,%rdx add %r9,%r8 mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i] @@ -1616,7 +1680,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r10 - mov 16*3($nptr),%rax + mov 8*3($nptr),%rax adc \$0,%rdx add %r10,%r9 mov 32+8(%rsp),$carry # pull n0, borrow $carry @@ -1625,7 +1689,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r11 - mov 16*4($nptr),%rax + mov 8*4($nptr),%rax adc \$0,%rdx imulq %r8,$carry # modulo-scheduled add %r11,%r10 @@ -1634,7 +1698,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r12 - mov 16*5($nptr),%rax + mov 8*5($nptr),%rax adc \$0,%rdx add %r12,%r11 mov %rdx,%r12 @@ -1642,7 +1706,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r13 - mov 16*6($nptr),%rax + mov 8*6($nptr),%rax adc \$0,%rdx add %r13,%r12 mov %rdx,%r13 @@ -1650,7 +1714,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r14 - mov 16*7($nptr),%rax + mov 8*7($nptr),%rax adc \$0,%rdx add %r14,%r13 mov %rdx,%r14 @@ -1659,7 +1723,7 @@ sqr8x_reduction: mulq $m0 mov $carry,$m0 # n0*a[i] add %rax,%r15 - mov 16*0($nptr),%rax # n[0] + mov 8*0($nptr),%rax # n[0] adc \$0,%rdx add %r15,%r14 mov %rdx,%r15 @@ -1668,7 +1732,7 @@ sqr8x_reduction: dec %ecx jnz .L8x_reduce - lea 16*8($nptr),$nptr + lea 8*8($nptr),$nptr xor %rax,%rax mov 8+8(%rsp),%rdx # pull end of t[] cmp 0+8(%rsp),$nptr # end of n[]? @@ -1687,21 +1751,21 @@ sqr8x_reduction: mov 48+56+8(%rsp),$m0 # pull n0*a[0] mov \$8,%ecx - mov 16*0($nptr),%rax + mov 8*0($nptr),%rax jmp .L8x_tail .align 32 .L8x_tail: mulq $m0 add %rax,%r8 - mov 16*1($nptr),%rax + mov 8*1($nptr),%rax mov %r8,($tptr) # save result mov %rdx,%r8 adc \$0,%r8 mulq $m0 add %rax,%r9 - mov 16*2($nptr),%rax + mov 8*2($nptr),%rax adc \$0,%rdx add %r9,%r8 lea 8($tptr),$tptr # $tptr++ @@ -1710,7 +1774,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r10 - mov 16*3($nptr),%rax + mov 8*3($nptr),%rax adc \$0,%rdx add %r10,%r9 mov %rdx,%r10 @@ -1718,7 +1782,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r11 - mov 16*4($nptr),%rax + mov 8*4($nptr),%rax adc \$0,%rdx add %r11,%r10 mov %rdx,%r11 @@ -1726,7 +1790,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r12 - mov 16*5($nptr),%rax + mov 8*5($nptr),%rax adc \$0,%rdx add %r12,%r11 mov %rdx,%r12 @@ -1734,7 +1798,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r13 - mov 16*6($nptr),%rax + mov 8*6($nptr),%rax adc \$0,%rdx add %r13,%r12 mov %rdx,%r13 @@ -1742,7 +1806,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r14 - mov 16*7($nptr),%rax + mov 8*7($nptr),%rax adc \$0,%rdx add %r14,%r13 mov %rdx,%r14 @@ -1753,14 +1817,14 @@ sqr8x_reduction: add %rax,%r15 adc \$0,%rdx add %r15,%r14 - mov 16*0($nptr),%rax # pull n[0] + mov 8*0($nptr),%rax # pull n[0] mov %rdx,%r15 adc \$0,%r15 dec %ecx jnz .L8x_tail - lea 16*8($nptr),$nptr + lea 8*8($nptr),$nptr mov 8+8(%rsp),%rdx # pull end of t[] cmp 0+8(%rsp),$nptr # end of n[]? jae .L8x_tail_done # break out of loop @@ -1806,7 +1870,7 @@ sqr8x_reduction: adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 adc \$0,%rax # top-most carry - mov -16($nptr),%rcx # np[num-1] + mov -8($nptr),%rcx # np[num-1] xor $carry,$carry movq %xmm2,$nptr # restore $nptr @@ -1833,33 +1897,49 @@ ___ my ($tptr,$nptr)=("%rbx","%rbp"); $code.=<<___; #xor %rsi,%rsi # %rsi was $carry above + mov 8*0($nptr),%r12 sub %r15,%rcx # compare top-most words lea (%rdi,$num),$tptr # %rdi was $tptr above adc %rsi,%rsi mov $num,%rcx or %rsi,%rax movq %xmm1,$rptr # restore $rptr - xor \$1,%rax + neg %rax movq %xmm1,$aptr # prepare for back-to-back call - lea ($nptr,%rax,8),$nptr - sar \$3+2,%rcx # cf=0 - jmp .Lsqr4x_sub + sar \$3+2,%rcx + dec %r12 # so that after 'not' we get -n[0] + xor %r10,%r10 + mov 8*1($nptr),%r13 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 + jmp .Lsqr4x_sub_entry -.align 32 +.align 16 .Lsqr4x_sub: - .byte 0x66 - mov 8*0($tptr),%r12 - mov 8*1($tptr),%r13 - sbb 16*0($nptr),%r12 - mov 8*2($tptr),%r14 - sbb 16*1($nptr),%r13 - mov 8*3($tptr),%r15 - lea 8*4($tptr),$tptr - sbb 16*2($nptr),%r14 + mov 8*0($nptr),%r12 + mov 8*1($nptr),%r13 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 +.Lsqr4x_sub_entry: + lea 8*4($nptr),$nptr + not %r12 + not %r13 + not %r14 + not %r15 + and %rax,%r12 + and %rax,%r13 + and %rax,%r14 + and %rax,%r15 + + neg %r10 # mov %r10,%cf + adc 8*0($tptr),%r12 + adc 8*1($tptr),%r13 + adc 8*2($tptr),%r14 + adc 8*3($tptr),%r15 mov %r12,8*0($rptr) - sbb 16*3($nptr),%r15 - lea 16*4($nptr),$nptr + lea 8*4($tptr),$tptr mov %r13,8*1($rptr) + sbb %r10,%r10 # mov %cf,%r10 mov %r14,8*2($rptr) mov %r15,8*3($rptr) lea 8*4($rptr),$rptr @@ -1897,39 +1977,32 @@ bn_from_mont8x: push %r13 push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; - .byte 0x67 - mov ${num}d,%r10d + shl \$3,${num}d # convert $num to bytes - shl \$3+2,%r10d # 4*$num + lea ($num,$num,2),%r10 # 3*$num in bytes neg $num mov ($n0),$n0 # *n0 ############################################################## - # ensure that stack frame doesn't alias with $aptr+4*$num - # modulo 4096, which covers ret[num], am[num] and n[2*num] - # (see bn_exp.c). this is done to allow memory disambiguation - # logic do its magic. + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). The stack is allocated to aligned with + # bn_power5's frame, and as bn_from_montgomery happens to be + # last operation, we use the opportunity to cleanse it. # - lea -64(%rsp,$num,2),%r11 - sub $aptr,%r11 + lea -320(%rsp,$num,2),%r11 + sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lfrom_sp_alt sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) jmp .Lfrom_sp_done .align 32 .Lfrom_sp_alt: - lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea 4096-320(,$num,2),%r10 + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 @@ -1983,8 +2056,8 @@ $code.=<<___; ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%r11d - and \$0x80100,%r11d - cmp \$0x80100,%r11d + and \$0x80108,%r11d + cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 jne .Lfrom_mont_nox lea (%rax,$num),$rptr @@ -2039,7 +2112,6 @@ $code.=<<___; .align 32 bn_mulx4x_mont_gather5: .Lmulx4x_enter: - .byte 0x67 mov %rsp,%rax push %rbx push %rbp @@ -2047,40 +2119,33 @@ bn_mulx4x_mont_gather5: push %r13 push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; - .byte 0x67 - mov ${num}d,%r10d + shl \$3,${num}d # convert $num to bytes - shl \$3+2,%r10d # 4*$num + lea ($num,$num,2),%r10 # 3*$num in bytes neg $num # -$num mov ($n0),$n0 # *n0 ############################################################## - # ensure that stack frame doesn't alias with $aptr+4*$num - # modulo 4096, which covers a[num], ret[num] and n[2*num] - # (see bn_exp.c). this is done to allow memory disambiguation - # logic do its magic. [excessive frame is allocated in order - # to allow bn_from_mont8x to clear it.] + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra [num] is allocated in order + # to align with bn_power5's frame, which is cleansed after + # completing exponentiation. Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] # - lea -64(%rsp,$num,2),%r11 - sub $ap,%r11 + lea -320(%rsp,$num,2),%r11 + sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmulx4xsp_alt sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,2),%rsp # alloca(frame+$num) + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) jmp .Lmulx4xsp_done -.align 32 .Lmulx4xsp_alt: - lea 4096-64(,$num,2),%r10 # 4096-frame-$num - lea -64(%rsp,$num,2),%rsp # alloca(frame+$num) + lea 4096-320(,$num,2),%r10 + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 @@ -2106,12 +2171,7 @@ $code.=<<___; mov 40(%rsp),%rsi # restore %rsp mov \$1,%rax -___ -$code.=<<___ if ($win64); - movaps -88(%rsi),%xmm6 - movaps -72(%rsi),%xmm7 -___ -$code.=<<___; + mov -48(%rsi),%r15 mov -40(%rsi),%r14 mov -32(%rsi),%r13 @@ -2126,14 +2186,16 @@ $code.=<<___; .type mulx4x_internal,\@abi-omnipotent .align 32 mulx4x_internal: - .byte 0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00 # mov $num,8(%rsp) # save -$num - .byte 0x67 + mov $num,8(%rsp) # save -$num (it was in bytes) + mov $num,%r10 neg $num # restore $num shl \$5,$num - lea 256($bp,$num),%r13 + neg %r10 # restore $num + lea 128($bp,$num),%r13 # end of powers table (+size optimization) shr \$5+5,$num - mov `($win64?56:8)`(%rax),%r10d # load 7th argument + movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument sub \$1,$num + lea .Linc(%rip),%rax mov %r13,16+8(%rsp) # end of b[num] mov $num,24+8(%rsp) # inner counter mov $rp, 56+8(%rsp) # save $rp @@ -2144,52 +2206,92 @@ my $rptr=$bptr; my $STRIDE=2**5*8; # 5 is "window size" my $N=$STRIDE/4; # should match cache line size $code.=<<___; - mov %r10,%r11 - shr \$`log($N/8)/log(2)`,%r10 - and \$`$N/8-1`,%r11 - not %r10 - lea .Lmagic_masks(%rip),%rax - and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" - lea 96($bp,%r11,8),$bptr # pointer within 1st cache line - movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which - movq 8(%rax,%r10,8),%xmm5 # cache line contains element - add \$7,%r11 - movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument - movq 24(%rax,%r10,8),%xmm7 - and \$7,%r11 - - movq `0*$STRIDE/4-96`($bptr),%xmm0 - lea $STRIDE($bptr),$tptr # borrow $tptr - movq `1*$STRIDE/4-96`($bptr),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bptr),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bptr),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - movq `0*$STRIDE/4-96`($tptr),%xmm1 - pand %xmm7,%xmm3 - por %xmm2,%xmm0 - movq `1*$STRIDE/4-96`($tptr),%xmm2 - por %xmm3,%xmm0 - .byte 0x67,0x67 - pand %xmm4,%xmm1 - movq `2*$STRIDE/4-96`($tptr),%xmm3 + movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 + lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimizaton) + lea 128($bp),$bptr # size optimization + pshufd \$0,%xmm5,%xmm5 # broadcast index + movdqa %xmm1,%xmm4 + .byte 0x67 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# calculate mask by comparing 0..31 to index and save result to stack +# +$code.=<<___; + .byte 0x67 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 + movdqa %xmm4,%xmm3 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($i+0)+112`(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($i+1)+112`(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($i+2)+112`(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,`16*($i+3)+112`(%r10) + movdqa %xmm4,%xmm3 +___ +} +$code.=<<___; # last iteration can be optimized + .byte 0x67 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,`16*($i+0)+112`(%r10) + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,`16*($i+1)+112`(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,`16*($i+2)+112`(%r10) + + pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register + pand `16*($i+1)-128`($bptr),%xmm1 + pand `16*($i+2)-128`($bptr),%xmm2 + movdqa %xmm3,`16*($i+3)+112`(%r10) + pand `16*($i+3)-128`($bptr),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bptr),%xmm4 + movdqa `16*($i+1)-128`($bptr),%xmm5 + movdqa `16*($i+2)-128`($bptr),%xmm2 + pand `16*($i+0)+112`(%r10),%xmm4 + movdqa `16*($i+3)-128`($bptr),%xmm3 + pand `16*($i+1)+112`(%r10),%xmm5 + por %xmm4,%xmm0 + pand `16*($i+2)+112`(%r10),%xmm2 + por %xmm5,%xmm1 + pand `16*($i+3)+112`(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +} +$code.=<<___; + pxor %xmm1,%xmm0 + pshufd \$0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + lea $STRIDE($bptr),$bptr movq %xmm0,%rdx # bp[0] - movq `3*$STRIDE/4-96`($tptr),%xmm0 - lea 2*$STRIDE($bptr),$bptr # next &b[i] - pand %xmm5,%xmm2 - .byte 0x67,0x67 - pand %xmm6,%xmm3 - ############################################################## - # $tptr is chosen so that writing to top-most element of the - # vector occurs just "above" references to powers table, - # "above" modulo cache-line size, which effectively precludes - # possibility of memory disambiguation logic failure when - # accessing the table. - # - lea 64+8*4+8(%rsp,%r11,8),$tptr + lea 64+8*4+8(%rsp),$tptr mov %rdx,$bi mulx 0*8($aptr),$mi,%rax # a[0]*b[0] @@ -2205,37 +2307,31 @@ $code.=<<___; xor $zero,$zero # cf=0, of=0 mov $mi,%rdx - por %xmm2,%xmm1 - pand %xmm7,%xmm0 - por %xmm3,%xmm1 mov $bptr,8+8(%rsp) # off-load &b[i] - por %xmm1,%xmm0 - .byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 # lea 4*8($aptr),$aptr + lea 4*8($aptr),$aptr adcx %rax,%r13 adcx $zero,%r14 # cf=0 - mulx 0*16($nptr),%rax,%r10 + mulx 0*8($nptr),%rax,%r10 adcx %rax,%r15 # discarded adox %r11,%r10 - mulx 1*16($nptr),%rax,%r11 + mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 - mulx 2*16($nptr),%rax,%r12 + mulx 2*8($nptr),%rax,%r12 mov 24+8(%rsp),$bptr # counter value - .byte 0x66 mov %r10,-8*4($tptr) adcx %rax,%r11 adox %r13,%r12 - mulx 3*16($nptr),%rax,%r15 - .byte 0x67,0x67 + mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r11,-8*3($tptr) adcx %rax,%r12 adox $zero,%r15 # of=0 - .byte 0x48,0x8d,0x89,0x40,0x00,0x00,0x00 # lea 4*16($nptr),$nptr + lea 4*8($nptr),$nptr mov %r12,-8*2($tptr) - #jmp .Lmulx4x_1st + jmp .Lmulx4x_1st .align 32 .Lmulx4x_1st: @@ -2255,30 +2351,29 @@ $code.=<<___; lea 4*8($tptr),$tptr adox %r15,%r10 - mulx 0*16($nptr),%rax,%r15 + mulx 0*8($nptr),%rax,%r15 adcx %rax,%r10 adox %r15,%r11 - mulx 1*16($nptr),%rax,%r15 + mulx 1*8($nptr),%rax,%r15 adcx %rax,%r11 adox %r15,%r12 - mulx 2*16($nptr),%rax,%r15 + mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) adcx %rax,%r12 mov %r11,-4*8($tptr) adox %r15,%r13 - mulx 3*16($nptr),%rax,%r15 + mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r12,-3*8($tptr) adcx %rax,%r13 adox $zero,%r15 - lea 4*16($nptr),$nptr + lea 4*8($nptr),$nptr mov %r13,-2*8($tptr) dec $bptr # of=0, pass cf jnz .Lmulx4x_1st mov 8(%rsp),$num # load -num - movq %xmm0,%rdx # bp[1] adc $zero,%r15 # modulo-scheduled lea ($aptr,$num),$aptr # rewind $aptr add %r15,%r14 @@ -2289,6 +2384,34 @@ $code.=<<___; .align 32 .Lmulx4x_outer: + lea 16-256($tptr),%r10 # where 256-byte mask is (+density control) + pxor %xmm4,%xmm4 + .byte 0x67,0x67 + pxor %xmm5,%xmm5 +___ +for($i=0;$i<$STRIDE/16;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bptr),%xmm0 + movdqa `16*($i+1)-128`($bptr),%xmm1 + movdqa `16*($i+2)-128`($bptr),%xmm2 + pand `16*($i+0)+256`(%r10),%xmm0 + movdqa `16*($i+3)-128`($bptr),%xmm3 + pand `16*($i+1)+256`(%r10),%xmm1 + por %xmm0,%xmm4 + pand `16*($i+2)+256`(%r10),%xmm2 + por %xmm1,%xmm5 + pand `16*($i+3)+256`(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + pshufd \$0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + lea $STRIDE($bptr),$bptr + movq %xmm0,%rdx # m0=bp[i] + mov $zero,($tptr) # save top-most carry lea 4*8($tptr,$num),$tptr # rewind $tptr mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] @@ -2303,54 +2426,37 @@ $code.=<<___; mulx 3*8($aptr),%rdx,%r14 adox -2*8($tptr),%r12 adcx %rdx,%r13 - lea ($nptr,$num,2),$nptr # rewind $nptr + lea ($nptr,$num),$nptr # rewind $nptr lea 4*8($aptr),$aptr adox -1*8($tptr),%r13 adcx $zero,%r14 adox $zero,%r14 - .byte 0x67 mov $mi,%r15 imulq 32+8(%rsp),$mi # "t[0]"*n0 - movq `0*$STRIDE/4-96`($bptr),%xmm0 - .byte 0x67,0x67 mov $mi,%rdx - movq `1*$STRIDE/4-96`($bptr),%xmm1 - .byte 0x67 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bptr),%xmm2 - .byte 0x67 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bptr),%xmm3 - add \$$STRIDE,$bptr # next &b[i] - .byte 0x67 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 xor $zero,$zero # cf=0, of=0 mov $bptr,8+8(%rsp) # off-load &b[i] - mulx 0*16($nptr),%rax,%r10 + mulx 0*8($nptr),%rax,%r10 adcx %rax,%r15 # discarded adox %r11,%r10 - mulx 1*16($nptr),%rax,%r11 + mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 - mulx 2*16($nptr),%rax,%r12 + mulx 2*8($nptr),%rax,%r12 adcx %rax,%r11 adox %r13,%r12 - mulx 3*16($nptr),%rax,%r15 + mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx - por %xmm2,%xmm0 mov 24+8(%rsp),$bptr # counter value mov %r10,-8*4($tptr) - por %xmm3,%xmm0 adcx %rax,%r12 mov %r11,-8*3($tptr) adox $zero,%r15 # of=0 mov %r12,-8*2($tptr) - lea 4*16($nptr),$nptr + lea 4*8($nptr),$nptr jmp .Lmulx4x_inner .align 32 @@ -2375,20 +2481,20 @@ $code.=<<___; adcx $zero,%r14 # cf=0 adox %r15,%r10 - mulx 0*16($nptr),%rax,%r15 + mulx 0*8($nptr),%rax,%r15 adcx %rax,%r10 adox %r15,%r11 - mulx 1*16($nptr),%rax,%r15 + mulx 1*8($nptr),%rax,%r15 adcx %rax,%r11 adox %r15,%r12 - mulx 2*16($nptr),%rax,%r15 + mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) adcx %rax,%r12 adox %r15,%r13 mov %r11,-4*8($tptr) - mulx 3*16($nptr),%rax,%r15 + mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx - lea 4*16($nptr),$nptr + lea 4*8($nptr),$nptr mov %r12,-3*8($tptr) adcx %rax,%r13 adox $zero,%r15 @@ -2398,7 +2504,6 @@ $code.=<<___; jnz .Lmulx4x_inner mov 0+8(%rsp),$num # load -num - movq %xmm0,%rdx # bp[i+1] adc $zero,%r15 # modulo-scheduled sub 0*8($tptr),$bptr # pull top-most carry to %cf mov 8+8(%rsp),$bptr # re-load &b[i] @@ -2411,20 +2516,26 @@ $code.=<<___; cmp %r10,$bptr jb .Lmulx4x_outer - mov -16($nptr),%r10 + mov -8($nptr),%r10 + mov $zero,%r8 + mov ($nptr,$num),%r12 + lea ($nptr,$num),%rbp # rewind $nptr + mov $num,%rcx + lea ($tptr,$num),%rdi # rewind $tptr + xor %eax,%eax xor %r15,%r15 sub %r14,%r10 # compare top-most words adc %r15,%r15 - or %r15,$zero - xor \$1,$zero - lea ($tptr,$num),%rdi # rewind $tptr - lea ($nptr,$num,2),$nptr # rewind $nptr - .byte 0x67,0x67 - sar \$3+2,$num # cf=0 - lea ($nptr,$zero,8),%rbp + or %r15,%r8 + sar \$3+2,%rcx + sub %r8,%rax # %rax=-%r8 mov 56+8(%rsp),%rdx # restore rp - mov $num,%rcx - jmp .Lsqrx4x_sub # common post-condition + dec %r12 # so that after 'not' we get -n[0] + mov 8*1(%rbp),%r13 + xor %r8,%r8 + mov 8*2(%rbp),%r14 + mov 8*3(%rbp),%r15 + jmp .Lsqrx4x_sub_entry # common post-condition .size mulx4x_internal,.-mulx4x_internal ___ } { @@ -2448,7 +2559,6 @@ $code.=<<___; .align 32 bn_powerx5: .Lpowerx5_enter: - .byte 0x67 mov %rsp,%rax push %rbx push %rbp @@ -2456,39 +2566,32 @@ bn_powerx5: push %r13 push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; - .byte 0x67 - mov ${num}d,%r10d + shl \$3,${num}d # convert $num to bytes - shl \$3+2,%r10d # 4*$num + lea ($num,$num,2),%r10 # 3*$num in bytes neg $num mov ($n0),$n0 # *n0 ############################################################## - # ensure that stack frame doesn't alias with $aptr+4*$num - # modulo 4096, which covers ret[num], am[num] and n[2*num] - # (see bn_exp.c). this is done to allow memory disambiguation - # logic do its magic. + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] # - lea -64(%rsp,$num,2),%r11 - sub $aptr,%r11 + lea -320(%rsp,$num,2),%r11 + sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwrx_sp_alt sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) jmp .Lpwrx_sp_done .align 32 .Lpwrx_sp_alt: - lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea 4096-320(,$num,2),%r10 + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 @@ -2534,12 +2637,7 @@ $code.=<<___; mov 40(%rsp),%rsi # restore %rsp mov \$1,%rax -___ -$code.=<<___ if ($win64); - movaps -88(%rsi),%xmm6 - movaps -72(%rsi),%xmm7 -___ -$code.=<<___; + mov -48(%rsi),%r15 mov -40(%rsi),%r14 mov -32(%rsi),%r13 @@ -2977,7 +3075,7 @@ sqrx8x_reduction: xor %eax,%eax # initial top-most carry bit mov 32+8(%rsp),%rbx # n0 mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr) - lea -128($nptr,$num,2),%rcx # end of n[] + lea -8*8($nptr,$num),%rcx # end of n[] #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer mov %rcx, 0+8(%rsp) # save end of n[] mov $tptr,8+8(%rsp) # save end of t[] @@ -3006,23 +3104,23 @@ sqrx8x_reduction: .align 32 .Lsqrx8x_reduce: mov %r8, %rbx - mulx 16*0($nptr),%rax,%r8 # n[0] + mulx 8*0($nptr),%rax,%r8 # n[0] adcx %rbx,%rax # discarded adox %r9,%r8 - mulx 16*1($nptr),%rbx,%r9 # n[1] + mulx 8*1($nptr),%rbx,%r9 # n[1] adcx %rbx,%r8 adox %r10,%r9 - mulx 16*2($nptr),%rbx,%r10 + mulx 8*2($nptr),%rbx,%r10 adcx %rbx,%r9 adox %r11,%r10 - mulx 16*3($nptr),%rbx,%r11 + mulx 8*3($nptr),%rbx,%r11 adcx %rbx,%r10 adox %r12,%r11 - .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rbx,%r12 + .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12 mov %rdx,%rax mov %r8,%rdx adcx %rbx,%r11 @@ -3032,15 +3130,15 @@ sqrx8x_reduction: mov %rax,%rdx mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i] - mulx 16*5($nptr),%rax,%r13 + mulx 8*5($nptr),%rax,%r13 adcx %rax,%r12 adox %r14,%r13 - mulx 16*6($nptr),%rax,%r14 + mulx 8*6($nptr),%rax,%r14 adcx %rax,%r13 adox %r15,%r14 - mulx 16*7($nptr),%rax,%r15 + mulx 8*7($nptr),%rax,%r15 mov %rbx,%rdx adcx %rax,%r14 adox $carry,%r15 # $carry is 0 @@ -3056,7 +3154,7 @@ sqrx8x_reduction: mov 48+8(%rsp),%rdx # pull n0*a[0] add 8*0($tptr),%r8 - lea 16*8($nptr),$nptr + lea 8*8($nptr),$nptr mov \$-8,%rcx adcx 8*1($tptr),%r9 adcx 8*2($tptr),%r10 @@ -3075,35 +3173,35 @@ sqrx8x_reduction: .align 32 .Lsqrx8x_tail: mov %r8,%rbx - mulx 16*0($nptr),%rax,%r8 + mulx 8*0($nptr),%rax,%r8 adcx %rax,%rbx adox %r9,%r8 - mulx 16*1($nptr),%rax,%r9 + mulx 8*1($nptr),%rax,%r9 adcx %rax,%r8 adox %r10,%r9 - mulx 16*2($nptr),%rax,%r10 + mulx 8*2($nptr),%rax,%r10 adcx %rax,%r9 adox %r11,%r10 - mulx 16*3($nptr),%rax,%r11 + mulx 8*3($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 - .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rax,%r12 + .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12 adcx %rax,%r11 adox %r13,%r12 - mulx 16*5($nptr),%rax,%r13 + mulx 8*5($nptr),%rax,%r13 adcx %rax,%r12 adox %r14,%r13 - mulx 16*6($nptr),%rax,%r14 + mulx 8*6($nptr),%rax,%r14 adcx %rax,%r13 adox %r15,%r14 - mulx 16*7($nptr),%rax,%r15 + mulx 8*7($nptr),%rax,%r15 mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i] adcx %rax,%r14 adox $carry,%r15 @@ -3119,7 +3217,7 @@ sqrx8x_reduction: sub 16+8(%rsp),$carry # mov 16(%rsp),%cf mov 48+8(%rsp),%rdx # pull n0*a[0] - lea 16*8($nptr),$nptr + lea 8*8($nptr),$nptr adc 8*0($tptr),%r8 adc 8*1($tptr),%r9 adc 8*2($tptr),%r10 @@ -3155,7 +3253,7 @@ sqrx8x_reduction: adc 8*0($tptr),%r8 movq %xmm3,%rcx adc 8*1($tptr),%r9 - mov 16*7($nptr),$carry + mov 8*7($nptr),$carry movq %xmm2,$nptr # restore $nptr adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 @@ -3191,35 +3289,46 @@ my ($rptr,$nptr)=("%rdx","%rbp"); my @ri=map("%r$_",(10..13)); my @ni=map("%r$_",(14..15)); $code.=<<___; + mov 8*0($nptr),%r12 xor %ebx,%ebx sub %r15,%rsi # compare top-most words adc %rbx,%rbx mov %rcx,%r10 # -$num or %rbx,%rax mov %rcx,%r9 # -$num - xor \$1,%rax - sar \$3+2,%rcx # cf=0 + neg %rax + sar \$3+2,%rcx #lea 48+8(%rsp,%r9),$tptr - lea ($nptr,%rax,8),$nptr movq %xmm1,$rptr # restore $rptr movq %xmm1,$aptr # prepare for back-to-back call - jmp .Lsqrx4x_sub + dec %r12 # so that after 'not' we get -n[0] + mov 8*1($nptr),%r13 + xor %r8,%r8 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 + jmp .Lsqrx4x_sub_entry -.align 32 .Lsqrx4x_sub: - .byte 0x66 - mov 8*0($tptr),%r12 - mov 8*1($tptr),%r13 - sbb 16*0($nptr),%r12 - mov 8*2($tptr),%r14 - sbb 16*1($nptr),%r13 - mov 8*3($tptr),%r15 - lea 8*4($tptr),$tptr - sbb 16*2($nptr),%r14 + mov 8*0($nptr),%r12 + mov 8*1($nptr),%r13 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 +.Lsqrx4x_sub_entry: + andn %rax,%r12,%r12 + lea 8*4($nptr),$nptr + andn %rax,%r13,%r13 + andn %rax,%r14,%r14 + andn %rax,%r15,%r15 + + neg %r8 # mov %r8,%cf + adc 8*0($tptr),%r12 + adc 8*1($tptr),%r13 + adc 8*2($tptr),%r14 + adc 8*3($tptr),%r15 mov %r12,8*0($rptr) - sbb 16*3($nptr),%r15 - lea 16*4($nptr),$nptr + lea 8*4($tptr),$tptr mov %r13,8*1($rptr) + sbb %r8,%r8 # mov %cf,%r8 mov %r14,8*2($rptr) mov %r15,8*3($rptr) lea 8*4($rptr),$rptr @@ -3282,56 +3391,91 @@ bn_scatter5: .globl bn_gather5 .type bn_gather5,\@abi-omnipotent -.align 16 +.align 32 bn_gather5: -___ -$code.=<<___ if ($win64); -.LSEH_begin_bn_gather5: +.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases # I can't trust assembler to use specific encoding:-( - .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp - .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) - .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) + .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10 + .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp + lea .Linc(%rip),%rax + and \$-16,%rsp # shouldn't be formally required + + movd $idx,%xmm5 + movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 + lea 128($tbl),%r11 # size optimization + lea 128(%rsp),%rax # size optimization + + pshufd \$0,%xmm5,%xmm5 # broadcast $idx + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 ___ +######################################################################## +# calculate mask by comparing 0..31 to $idx and save result to stack +# +for($i=0;$i<$STRIDE/16;$i+=4) { $code.=<<___; - mov $idx,%r11d - shr \$`log($N/8)/log(2)`,$idx - and \$`$N/8-1`,%r11 - not $idx - lea .Lmagic_masks(%rip),%rax - and \$`2**5/($N/8)-1`,$idx # 5 is "window size" - lea 128($tbl,%r11,8),$tbl # pointer within 1st cache line - movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which - movq 8(%rax,$idx,8),%xmm5 # cache line contains element - movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument - movq 24(%rax,$idx,8),%xmm7 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 +___ +$code.=<<___ if ($i); + movdqa %xmm3,`16*($i-1)-128`(%rax) +___ +$code.=<<___; + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($i+0)-128`(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($i+1)-128`(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($i+2)-128`(%rax) + movdqa %xmm4,%xmm2 +___ +} +$code.=<<___; + movdqa %xmm3,`16*($i-1)-128`(%rax) jmp .Lgather -.align 16 -.Lgather: - movq `0*$STRIDE/4-128`($tbl),%xmm0 - movq `1*$STRIDE/4-128`($tbl),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-128`($tbl),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-128`($tbl),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - .byte 0x67,0x67 - por %xmm2,%xmm0 - lea $STRIDE($tbl),$tbl - por %xmm3,%xmm0 +.align 32 +.Lgather: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +for($i=0;$i<$STRIDE/16;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`(%r11),%xmm0 + movdqa `16*($i+1)-128`(%r11),%xmm1 + movdqa `16*($i+2)-128`(%r11),%xmm2 + pand `16*($i+0)-128`(%rax),%xmm0 + movdqa `16*($i+3)-128`(%r11),%xmm3 + pand `16*($i+1)-128`(%rax),%xmm1 + por %xmm0,%xmm4 + pand `16*($i+2)-128`(%rax),%xmm2 + por %xmm1,%xmm5 + pand `16*($i+3)-128`(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + lea $STRIDE(%r11),%r11 + pshufd \$0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 movq %xmm0,($out) # m0=bp[0] lea 8($out),$out sub \$1,$num jnz .Lgather -___ -$code.=<<___ if ($win64); - movaps (%rsp),%xmm6 - movaps 0x10(%rsp),%xmm7 - lea 0x28(%rsp),%rsp -___ -$code.=<<___; + + lea (%r10),%rsp ret .LSEH_end_bn_gather5: .size bn_gather5,.-bn_gather5 @@ -3340,8 +3484,11 @@ ___ $code.=<<___; .align 64 .Lmagic_masks: - .long 0,0, 0,0, 0,0, -1,-1 - .long 0,0, 0,0, 0,0, 0,0 + .long 0x00,0x00,0x01,0x01, 0x08,0x08,0x09,0x09 + .long 0x10,0x10,0x11,0x11, 0x18,0x18,0x19,0x19 +.Linc: + .long 0,0, 1,1 + .long 2,2, 2,2 .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by " ___ @@ -3389,19 +3536,23 @@ mul_handler: lea .Lmul_epilogue(%rip),%r10 cmp %r10,%rbx - jb .Lbody_40 + ja .Lbody_40 mov 192($context),%r10 # pull $num mov 8(%rax,%r10,8),%rax # pull saved stack pointer + + movaps -104(%rax),%xmm0 + movaps -88(%rax),%xmm1 + movaps -72(%rax),%xmm2 + + movups %xmm0,512($context) # restore context->Xmm6 + movups %xmm1,528($context) # restore context->Xmm7 + movups %xmm2,544($context) # restore context->Xmm8 jmp .Lbody_proceed .Lbody_40: mov 40(%rax),%rax # pull saved stack pointer .Lbody_proceed: - - movaps -88(%rax),%xmm0 - movaps -72(%rax),%xmm1 - mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 @@ -3414,8 +3565,6 @@ mul_handler: mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 - movups %xmm0,512($context) # restore context->Xmm6 - movups %xmm1,528($context) # restore context->Xmm7 .Lcommon_seh_tail: mov 8(%rax),%rdi @@ -3526,10 +3675,8 @@ ___ $code.=<<___; .align 8 .LSEH_info_bn_gather5: - .byte 0x01,0x0d,0x05,0x00 - .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 - .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 - .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28 + .byte 0x01,0x0b,0x02,0x00 + .byte 0x0b,0x01,0x21,0x00 #sub rsp,0x108 .align 8 ___ } diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c index 07f9288bb5..c5e579c77c 100644 --- a/crypto/bn/bn_exp.c +++ b/crypto/bn/bn_exp.c @@ -787,8 +787,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, if (window >= 5) { window = 5; /* ~5% improvement for RSA2048 sign, and even * for RSA4096 */ - if ((top & 7) == 0) - powerbufLen += 2 * top * sizeof(m->d[0]); + /* reserve space for mont->N.d[] copy */ + powerbufLen += top * sizeof(mont->N.d[0]); } #endif (void)0; @@ -1008,7 +1008,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, const BN_ULONG *not_used, const BN_ULONG *np, const BN_ULONG *n0, int num); - BN_ULONG *np = mont->N.d, *n0 = mont->n0, *np2; + BN_ULONG *n0 = mont->n0, *np; /* * BN_to_montgomery can contaminate words above .top [in @@ -1019,11 +1019,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, for (i = tmp.top; i < top; i++) tmp.d[i] = 0; - if (top & 7) - np2 = np; - else - for (np2 = am.d + top, i = 0; i < top; i++) - np2[2 * i] = np[i]; + /* + * copy mont->N.d[] to improve cache locality + */ + for (np = am.d + top, i = 0; i < top; i++) + np[i] = mont->N.d[i]; bn_scatter5(tmp.d, top, powerbuf, 0); bn_scatter5(am.d, am.top, powerbuf, 1); @@ -1033,7 +1033,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, # if 0 for (i = 3; i < 32; i++) { /* Calculate a^i = a^(i-1) * a */ - bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1); + bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); bn_scatter5(tmp.d, top, powerbuf, i); } # else @@ -1044,7 +1044,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, } for (i = 3; i < 8; i += 2) { int j; - bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1); + bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); bn_scatter5(tmp.d, top, powerbuf, i); for (j = 2 * i; j < 32; j *= 2) { bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); @@ -1052,13 +1052,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, } } for (; i < 16; i += 2) { - bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1); + bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); bn_scatter5(tmp.d, top, powerbuf, i); bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); bn_scatter5(tmp.d, top, powerbuf, 2 * i); } for (; i < 32; i += 2) { - bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1); + bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); bn_scatter5(tmp.d, top, powerbuf, i); } # endif @@ -1087,11 +1087,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, while (bits >= 0) { wvalue = bn_get_bits5(p->d, bits - 4); bits -= 5; - bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue); + bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue); } } - ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np2, n0, top); + ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top); tmp.top = top; bn_correct_top(&tmp); if (ret) {