From: Andy Polyakov Date: Wed, 16 Mar 2016 22:33:53 +0000 (+0100) Subject: bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking. X-Git-Tag: OpenSSL_1_1_0~90 X-Git-Url: https://git.openssl.org/?p=openssl.git;a=commitdiff_plain;h=3ba1ef829cf3dd36eaa5e819258d90291c6a1027 bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking. The original strategy for page-walking was to adjust the stack pointer and then touch pages in order. This kind of asks for a double-fault, because if a touch fails, the signal will be delivered to a frame above the adjusted stack pointer. But touching pages prior to adjusting the stack pointer would upset valgrind. As a compromise, let's adjust the stack pointer in pages, touching the top of the stack. This still asks for a double-fault, but at least prevents corruption of a neighbouring stack if the allocation is to overstep the guard page. Also omit predict-non-taken hints, as they reportedly trigger illegal instructions in some VM setups. Reviewed-by: Richard Levitte --- diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl index 09296ec662..6787503666 100755 --- a/crypto/bn/asm/x86-mont.pl +++ b/crypto/bn/asm/x86-mont.pl @@ -73,27 +73,26 @@ $frame=32; # size of above frame rounded up to 16n &lea ("esi",&wparam(0)); # put aside pointer to argument block &lea ("edx",&wparam(1)); # load ap - &mov ("ebp","esp"); # saved stack pointer! &add ("edi",2); # extra two words on top of tp &neg ("edi"); - &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2)) + &lea ("ebp",&DWP(-$frame,"esp","edi",4)); # future alloca($frame+4*(num+2)) &neg ("edi"); # minimize cache contention by arraning 2K window between stack # pointer and ap argument [np is also position sensitive vector, # but it's assumed to be near ap, as it's allocated at ~same # time]. 
- &mov ("eax","esp"); + &mov ("eax","ebp"); &sub ("eax","edx"); &and ("eax",2047); - &sub ("esp","eax"); # this aligns sp and ap modulo 2048 + &sub ("ebp","eax"); # this aligns sp and ap modulo 2048 - &xor ("edx","esp"); + &xor ("edx","ebp"); &and ("edx",2048); &xor ("edx",2048); - &sub ("esp","edx"); # this splits them apart modulo 4096 + &sub ("ebp","edx"); # this splits them apart modulo 4096 - &and ("esp",-64); # align to cache line + &and ("ebp",-64); # align to cache line # An OS-agnostic version of __chkstk. # @@ -103,20 +102,28 @@ $frame=32; # size of above frame rounded up to 16n # be punishable by SEGV. But page walking can do good even on # other OSes, because it guarantees that villain thread hits # the guard page before it can make damage to innocent one... - &mov ("eax","ebp"); - &sub ("eax","esp"); + &mov ("eax","esp"); + &sub ("eax","ebp"); &and ("eax",-4096); -&set_label("page_walk"); - &mov ("edx",&DWP(0,"esp","eax")); - &sub ("eax",4096); - &data_byte(0x2e); - &jnc (&label("page_walk")); + &mov ("edx","esp"); # saved stack pointer! + &lea ("esp",&DWP(0,"ebp","eax")); + &mov ("eax",&DWP(0,"esp")); + &cmp ("esp","ebp"); + &ja (&label("page_walk")); + &jmp (&label("page_walk_done")); + +&set_label("page_walk",16); + &lea ("esp",&DWP(-4096,"esp")); + &mov ("eax",&DWP(0,"esp")); + &cmp ("esp","ebp"); + &ja (&label("page_walk")); +&set_label("page_walk_done"); ################################# load argument block... &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp - &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np + &mov ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0 #&mov ("edi",&DWP(5*4,"esi"));# int num @@ -124,11 +131,11 @@ $frame=32; # size of above frame rounded up to 16n &mov ($_rp,"eax"); # ... 
save a copy of argument block &mov ($_ap,"ebx"); &mov ($_bp,"ecx"); - &mov ($_np,"edx"); + &mov ($_np,"ebp"); &mov ($_n0,"esi"); &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling #&mov ($_num,$num); # redundant as $num is not reused - &mov ($_sp,"ebp"); # saved stack pointer! + &mov ($_sp,"edx"); # saved stack pointer! if($sse2) { $acc0="mm0"; # mmx register bank layout diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl index 3a2511f7f2..0451fef12e 100755 --- a/crypto/bn/asm/x86_64-mont.pl +++ b/crypto/bn/asm/x86_64-mont.pl @@ -104,6 +104,8 @@ $code=<<___; .type bn_mul_mont,\@function,6 .align 16 bn_mul_mont: + mov ${num}d,${num}d + mov %rsp,%rax test \$3,${num}d jnz .Lmul_enter cmp \$8,${num}d @@ -128,15 +130,12 @@ $code.=<<___; push %r14 push %r15 - mov ${num}d,${num}d - lea 2($num),%r10 + neg $num mov %rsp,%r11 - neg %r10 - lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2)) - and \$-1024,%rsp # minimize TLB usage + lea -16(%rsp,$num,8),%r10 # future alloca(8*(num+2)) + neg $num # restore $num + and \$-1024,%r10 # minimize TLB usage - mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp -.Lmul_body: # An OS-agnostic version of __chkstk. # # Some OSes (Windows) insist on stack being "wired" to @@ -145,14 +144,24 @@ $code.=<<___; # be punishable by SEGV. But page walking can do good even on # other OSes, because it guarantees that villain thread hits # the guard page before it can make damage to innocent one... 
- sub %rsp,%r11 + sub %r10,%r11 and \$-4096,%r11 + lea (%r10,%r11),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk + jmp .Lmul_page_walk_done + +.align 16 .Lmul_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x66,0x2e # predict non-taken - jnc .Lmul_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk +.Lmul_page_walk_done: + mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.Lmul_body: mov $bp,%r12 # reassign $bp ___ $bp="%r12"; @@ -323,13 +332,13 @@ $code.=<<___; mov 8(%rsp,$num,8),%rsi # restore %rsp mov \$1,%rax - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lmul_epilogue: ret .size bn_mul_mont,.-bn_mul_mont @@ -341,6 +350,8 @@ $code.=<<___; .type bn_mul4x_mont,\@function,6 .align 16 bn_mul4x_mont: + mov ${num}d,${num}d + mov %rsp,%rax .Lmul4x_enter: ___ $code.=<<___ if ($addx); @@ -356,23 +367,29 @@ $code.=<<___; push %r14 push %r15 - mov ${num}d,${num}d - lea 4($num),%r10 + neg $num mov %rsp,%r11 - neg %r10 - lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4)) - and \$-1024,%rsp # minimize TLB usage + lea -32(%rsp,$num,8),%r10 # future alloca(8*(num+4)) + neg $num # restore + and \$-1024,%r10 # minimize TLB usage - mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp -.Lmul4x_body: - sub %rsp,%r11 + sub %r10,%r11 and \$-4096,%r11 + lea (%r10,%r11),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + .Lmul4x_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x2e # predict non-taken - jnc .Lmul4x_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: + mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.Lmul4x_body: mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp mov %rdx,%r12 # reassign $bp ___ @@ -751,13 
+768,13 @@ ___ $code.=<<___; mov 8(%rsp,$num,8),%rsi # restore %rsp mov \$1,%rax - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lmul4x_epilogue: ret .size bn_mul4x_mont,.-bn_mul4x_mont @@ -787,14 +804,15 @@ $code.=<<___; .type bn_sqr8x_mont,\@function,6 .align 32 bn_sqr8x_mont: -.Lsqr8x_enter: mov %rsp,%rax +.Lsqr8x_enter: push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 +.Lsqr8x_prologue: mov ${num}d,%r10d shl \$3,${num}d # convert $num to bytes @@ -807,33 +825,42 @@ bn_sqr8x_mont: # do its job. # lea -64(%rsp,$num,2),%r11 + mov %rsp,%rbp mov ($n0),$n0 # *n0 sub $aptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lsqr8x_sp_alt - sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + sub %r11,%rbp # align with $aptr + lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num) jmp .Lsqr8x_sp_done .align 32 .Lsqr8x_sp_alt: lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lsqr8x_sp_done: - and \$-64,%rsp - mov %rax,%r11 - sub %rsp,%r11 + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lsqr8x_page_walk + jmp .Lsqr8x_page_walk_done + +.align 16 .Lsqr8x_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x2e # predict non-taken - jnc .Lsqr8x_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lsqr8x_page_walk +.Lsqr8x_page_walk_done: mov $num,%r10 neg $num @@ -957,30 +984,38 @@ $code.=<<___; .type bn_mulx4x_mont,\@function,6 .align 32 bn_mulx4x_mont: -.Lmulx4x_enter: mov %rsp,%rax +.Lmulx4x_enter: push %rbx push %rbp push 
%r12 push %r13 push %r14 push %r15 +.Lmulx4x_prologue: shl \$3,${num}d # convert $num to bytes - .byte 0x67 xor %r10,%r10 sub $num,%r10 # -$num mov ($n0),$n0 # *n0 - lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8) - and \$-128,%rsp - mov %rax,%r11 - sub %rsp,%r11 + lea -72(%rsp,%r10),%rbp # future alloca(frame+$num+8) + and \$-128,%rbp + mov %rsp,%r11 + sub %rbp,%r11 and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.align 16 .Lmulx4x_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x66,0x2e # predict non-taken - jnc .Lmulx4x_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: lea ($bp,$num),%r10 ############################################################## @@ -1341,22 +1376,8 @@ mul_handler: mov 192($context),%r10 # pull $num mov 8(%rax,%r10,8),%rax # pull saved stack pointer - lea 48(%rax),%rax - - mov -8(%rax),%rbx - mov -16(%rax),%rbp - mov -24(%rax),%r12 - mov -32(%rax),%r13 - mov -40(%rax),%r14 - mov -48(%rax),%r15 - mov %rbx,144($context) # restore context->Rbx - mov %rbp,160($context) # restore context->Rbp - mov %r12,216($context) # restore context->R12 - mov %r13,224($context) # restore context->R13 - mov %r14,232($context) # restore context->R14 - mov %r15,240($context) # restore context->R15 - jmp .Lcommon_seh_tail + jmp .Lcommon_pop_regs .size mul_handler,.-mul_handler .type sqr_handler,\@abi-omnipotent @@ -1384,15 +1405,21 @@ sqr_handler: cmp %r10,%rbx # context->Rip<.Lsqr_body jb .Lcommon_seh_tail + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # body label + cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue + jb .Lcommon_pop_regs + mov 152($context),%rax # pull context->Rsp - mov 4(%r11),%r10d # HandlerData[1] + mov 8(%r11),%r10d # HandlerData[2] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue jae .Lcommon_seh_tail mov 40(%rax),%rax # pull saved stack 
pointer +.Lcommon_pop_regs: mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 @@ -1479,13 +1506,15 @@ $code.=<<___; .LSEH_info_bn_sqr8x_mont: .byte 9,0,0,0 .rva sqr_handler - .rva .Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[] + .rva .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[] +.align 8 ___ $code.=<<___ if ($addx); .LSEH_info_bn_mulx4x_mont: .byte 9,0,0,0 .rva sqr_handler - .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] + .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] +.align 8 ___ } diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl index 2a7972d610..3278dc6056 100755 --- a/crypto/bn/asm/x86_64-mont5.pl +++ b/crypto/bn/asm/x86_64-mont5.pl @@ -93,6 +93,8 @@ $code=<<___; .type bn_mul_mont_gather5,\@function,6 .align 64 bn_mul_mont_gather5: + mov ${num}d,${num}d + mov %rsp,%rax test \$7,${num}d jnz .Lmul_enter ___ @@ -104,10 +106,7 @@ $code.=<<___; .align 16 .Lmul_enter: - mov ${num}d,${num}d - mov %rsp,%rax movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument - lea .Linc(%rip),%r10 push %rbx push %rbp push %r12 @@ -115,13 +114,12 @@ $code.=<<___; push %r14 push %r15 - lea 2($num),%r11 - neg %r11 - lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8) - and \$-1024,%rsp # minimize TLB usage + neg $num + mov %rsp,%r11 + lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8) + neg $num # restore $num + and \$-1024,%r10 # minimize TLB usage - mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp -.Lmul_body: # An OS-agnostic version of __chkstk. # # Some OSes (Windows) insist on stack being "wired" to @@ -130,13 +128,24 @@ $code.=<<___; # be punishable by SEGV. But page walking can do good even on # other OSes, because it guarantees that villain thread hits # the guard page before it can make damage to innocent one... 
- sub %rsp,%rax - and \$-4096,%rax + sub %r10,%r11 + and \$-4096,%r11 + lea (%r10,%r11),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk + jmp .Lmul_page_walk_done + .Lmul_page_walk: - mov (%rsp,%rax),%r11 - sub \$4096,%rax - .byte 0x2e # predict non-taken - jnc .Lmul_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk +.Lmul_page_walk_done: + + lea .Linc(%rip),%r10 + mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.Lmul_body: lea 128($bp),%r12 # reassign $bp (+size optimization) ___ @@ -442,6 +451,8 @@ $code.=<<___; .type bn_mul4x_mont_gather5,\@function,6 .align 32 bn_mul4x_mont_gather5: + .byte 0x67 + mov %rsp,%rax .Lmul4x_enter: ___ $code.=<<___ if ($addx); @@ -450,14 +461,13 @@ $code.=<<___ if ($addx); je .Lmulx4x_enter ___ $code.=<<___; - .byte 0x67 - mov %rsp,%rax push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 +.Lmul4x_prologue: .byte 0x67 shl \$3,${num}d # convert $num to bytes @@ -474,32 +484,40 @@ $code.=<<___; # calculated from 7th argument, the index.] 
# lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmul4xsp_alt - sub %r11,%rsp # align with $rp - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) + sub %r11,%rbp # align with $rp + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) jmp .Lmul4xsp_done .align 32 .Lmul4xsp_alt: lea 4096-320(,$num,2),%r10 - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lmul4xsp_done: - and \$-64,%rsp - mov %rax,%r11 - sub %rsp,%r11 + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + .Lmul4x_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x2e # predict non-taken - jnc .Lmul4x_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: neg $num @@ -1043,6 +1061,7 @@ $code.=<<___; .type bn_power5,\@function,6 .align 32 bn_power5: + mov %rsp,%rax ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%r11d @@ -1051,13 +1070,13 @@ $code.=<<___ if ($addx); je .Lpowerx5_enter ___ $code.=<<___; - mov %rsp,%rax push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 +.Lpower5_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10d # 3*$num @@ -1072,32 +1091,40 @@ $code.=<<___; # calculated from 7th argument, the index.] 
# lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwr_sp_alt - sub %r11,%rsp # align with $aptr - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) jmp .Lpwr_sp_done .align 32 .Lpwr_sp_alt: lea 4096-320(,$num,2),%r10 - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lpwr_sp_done: - and \$-64,%rsp - mov %rax,%r11 - sub %rsp,%r11 + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwr_page_walk + jmp .Lpwr_page_walk_done + .Lpwr_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x2e # predict non-taken - jnc .Lpwr_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwr_page_walk +.Lpwr_page_walk_done: mov $num,%r10 neg $num @@ -2037,6 +2064,7 @@ bn_from_mont8x: push %r13 push %r14 push %r15 +.Lfrom_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes @@ -2051,32 +2079,40 @@ bn_from_mont8x: # last operation, we use the opportunity to cleanse it. 
# lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lfrom_sp_alt - sub %r11,%rsp # align with $aptr - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lfrom_sp_done .align 32 .Lfrom_sp_alt: lea 4096-320(,$num,2),%r10 - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lfrom_sp_done: - and \$-64,%rsp - mov %rax,%r11 - sub %rsp,%r11 + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lfrom_page_walk + jmp .Lfrom_page_walk_done + .Lfrom_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x2e # predict non-taken - jnc .Lfrom_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lfrom_page_walk +.Lfrom_page_walk_done: mov $num,%r10 neg $num @@ -2182,14 +2218,15 @@ $code.=<<___; .type bn_mulx4x_mont_gather5,\@function,6 .align 32 bn_mulx4x_mont_gather5: -.Lmulx4x_enter: mov %rsp,%rax +.Lmulx4x_enter: push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 +.Lmulx4x_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes @@ -2206,31 +2243,39 @@ bn_mulx4x_mont_gather5: # calculated from 7th argument, the index.] 
# lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmulx4xsp_alt - sub %r11,%rsp # align with $aptr - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lmulx4xsp_done .Lmulx4xsp_alt: lea 4096-320(,$num,2),%r10 - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lmulx4xsp_done: - and \$-64,%rsp # ensure alignment - mov %rax,%r11 - sub %rsp,%r11 + and \$-64,%rbp # ensure alignment + mov %rsp,%r11 + sub %rbp,%r11 and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + .Lmulx4x_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x2e # predict non-taken - jnc .Lmulx4x_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: ############################################################## # Stack layout @@ -2638,14 +2683,15 @@ $code.=<<___; .type bn_powerx5,\@function,6 .align 32 bn_powerx5: -.Lpowerx5_enter: mov %rsp,%rax +.Lpowerx5_enter: push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 +.Lpowerx5_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes @@ -2660,32 +2706,40 @@ bn_powerx5: # calculated from 7th argument, the index.] 
# lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwrx_sp_alt - sub %r11,%rsp # align with $aptr - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lpwrx_sp_done .align 32 .Lpwrx_sp_alt: lea 4096-320(,$num,2),%r10 - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lpwrx_sp_done: - and \$-64,%rsp - mov %rax,%r11 - sub %rsp,%r11 + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwrx_page_walk + jmp .Lpwrx_page_walk_done + .Lpwrx_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x2e # predict non-taken - jnc .Lpwrx_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwrx_page_walk +.Lpwrx_page_walk_done: mov $num,%r10 neg $num @@ -3616,9 +3670,14 @@ mul_handler: cmp %r10,%rbx # context->Rip<end of prologue label jb .Lcommon_seh_tail + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # body label + cmp %r10,%rbx # context->Rip>=epilogue label + jb .Lcommon_pop_regs + mov 152($context),%rax # pull context->Rsp - mov 4(%r11),%r10d # HandlerData[1] + mov 8(%r11),%r10d # HandlerData[2] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail @@ -3630,11 +3689,11 @@ mul_handler: mov 192($context),%r10 # pull $num mov 8(%rax,%r10,8),%rax # pull saved stack pointer - jmp .Lbody_proceed + jmp .Lcommon_pop_regs .Lbody_40: mov 40(%rax),%rax # pull saved stack pointer -.Lbody_proceed: +.Lcommon_pop_regs: mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 @@ -3725,34 +3784,34 @@ $code.=<<___; .LSEH_info_bn_mul_mont_gather5: .byte 9,0,0,0 .rva mul_handler - .rva .Lmul_body,.Lmul_epilogue # HandlerData[] + .rva .Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[] .align 8 .LSEH_info_bn_mul4x_mont_gather5: .byte 9,0,0,0 .rva mul_handler - .rva 
.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] + .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] .align 8 .LSEH_info_bn_power5: .byte 9,0,0,0 .rva mul_handler - .rva .Lpower5_body,.Lpower5_epilogue # HandlerData[] + .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[] .align 8 .LSEH_info_bn_from_mont8x: .byte 9,0,0,0 .rva mul_handler - .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[] + .rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[] ___ $code.=<<___ if ($addx); .align 8 .LSEH_info_bn_mulx4x_mont_gather5: .byte 9,0,0,0 .rva mul_handler - .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] + .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] .align 8 .LSEH_info_bn_powerx5: .byte 9,0,0,0 .rva mul_handler - .rva .Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[] + .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[] ___ $code.=<<___; .align 8