bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.
authorAndy Polyakov <appro@openssl.org>
Wed, 16 Mar 2016 22:33:53 +0000 (23:33 +0100)
committerAndy Polyakov <appro@openssl.org>
Mon, 22 Aug 2016 12:58:32 +0000 (14:58 +0200)
Original strategy for page-walking was adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior adjusting stack pointer would
upset valgrind. As compromise let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Reviewed-by: Richard Levitte <levitte@openssl.org>
crypto/bn/asm/x86-mont.pl
crypto/bn/asm/x86_64-mont.pl
crypto/bn/asm/x86_64-mont5.pl

index 09296ec662d9adca149f7658f2a1f0086f209800..6787503666de94b38b35f2b4526427357f09cdeb 100755 (executable)
@@ -73,27 +73,26 @@ $frame=32;                          # size of above frame rounded up to 16n
 
        &lea    ("esi",&wparam(0));     # put aside pointer to argument block
        &lea    ("edx",&wparam(1));     # load ap
-       &mov    ("ebp","esp");          # saved stack pointer!
        &add    ("edi",2);              # extra two words on top of tp
        &neg    ("edi");
-       &lea    ("esp",&DWP(-$frame,"esp","edi",4));    # alloca($frame+4*(num+2))
+       &lea    ("ebp",&DWP(-$frame,"esp","edi",4));    # future alloca($frame+4*(num+2))
        &neg    ("edi");
 
        # minimize cache contention by arraning 2K window between stack
        # pointer and ap argument [np is also position sensitive vector,
        # but it's assumed to be near ap, as it's allocated at ~same
        # time].
-       &mov    ("eax","esp");
+       &mov    ("eax","ebp");
        &sub    ("eax","edx");
        &and    ("eax",2047);
-       &sub    ("esp","eax");          # this aligns sp and ap modulo 2048
+       &sub    ("ebp","eax");          # this aligns sp and ap modulo 2048
 
-       &xor    ("edx","esp");
+       &xor    ("edx","ebp");
        &and    ("edx",2048);
        &xor    ("edx",2048);
-       &sub    ("esp","edx");          # this splits them apart modulo 4096
+       &sub    ("ebp","edx");          # this splits them apart modulo 4096
 
-       &and    ("esp",-64);            # align to cache line
+       &and    ("ebp",-64);            # align to cache line
 
        # An OS-agnostic version of __chkstk.
        #
@@ -103,20 +102,28 @@ $frame=32;                                # size of above frame rounded up to 16n
        # be punishable by SEGV. But page walking can do good even on
        # other OSes, because it guarantees that villain thread hits
        # the guard page before it can make damage to innocent one...
-       &mov    ("eax","ebp");
-       &sub    ("eax","esp");
+       &mov    ("eax","esp");
+       &sub    ("eax","ebp");
        &and    ("eax",-4096);
-&set_label("page_walk");
-       &mov    ("edx",&DWP(0,"esp","eax"));
-       &sub    ("eax",4096);
-       &data_byte(0x2e);
-       &jnc    (&label("page_walk"));
+       &mov    ("edx","esp");          # saved stack pointer!
+       &lea    ("esp",&DWP(0,"ebp","eax"));
+       &mov    ("eax",&DWP(0,"esp"));
+       &cmp    ("esp","ebp");
+       &ja     (&label("page_walk"));
+       &jmp    (&label("page_walk_done"));
+
+&set_label("page_walk",16);
+       &lea    ("esp",&DWP(-4096,"esp"));
+       &mov    ("eax",&DWP(0,"esp"));
+       &cmp    ("esp","ebp");
+       &ja     (&label("page_walk"));
+&set_label("page_walk_done");
 
        ################################# load argument block...
        &mov    ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
        &mov    ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
        &mov    ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
-       &mov    ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
+       &mov    ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
        &mov    ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
        #&mov   ("edi",&DWP(5*4,"esi"));# int num
 
@@ -124,11 +131,11 @@ $frame=32;                                # size of above frame rounded up to 16n
        &mov    ($_rp,"eax");           # ... save a copy of argument block
        &mov    ($_ap,"ebx");
        &mov    ($_bp,"ecx");
-       &mov    ($_np,"edx");
+       &mov    ($_np,"ebp");
        &mov    ($_n0,"esi");
        &lea    ($num,&DWP(-3,"edi"));  # num=num-1 to assist modulo-scheduling
        #&mov   ($_num,$num);           # redundant as $num is not reused
-       &mov    ($_sp,"ebp");           # saved stack pointer!
+       &mov    ($_sp,"edx");           # saved stack pointer!
 \f
 if($sse2) {
 $acc0="mm0";   # mmx register bank layout
index 3a2511f7f2872485154e546e907d2ce148e2284b..0451fef12e952f382adbcec062e67bc18708f5b0 100755 (executable)
@@ -104,6 +104,8 @@ $code=<<___;
 .type  bn_mul_mont,\@function,6
 .align 16
 bn_mul_mont:
+       mov     ${num}d,${num}d
+       mov     %rsp,%rax
        test    \$3,${num}d
        jnz     .Lmul_enter
        cmp     \$8,${num}d
@@ -128,15 +130,12 @@ $code.=<<___;
        push    %r14
        push    %r15
 
-       mov     ${num}d,${num}d
-       lea     2($num),%r10
+       neg     $num
        mov     %rsp,%r11
-       neg     %r10
-       lea     (%rsp,%r10,8),%rsp      # tp=alloca(8*(num+2))
-       and     \$-1024,%rsp            # minimize TLB usage
+       lea     -16(%rsp,$num,8),%r10   # future alloca(8*(num+2))
+       neg     $num                    # restore $num
+       and     \$-1024,%r10            # minimize TLB usage
 
-       mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
-.Lmul_body:
        # An OS-agnostic version of __chkstk.
        #
        # Some OSes (Windows) insist on stack being "wired" to
@@ -145,14 +144,24 @@ $code.=<<___;
        # be punishable by SEGV. But page walking can do good even on
        # other OSes, because it guarantees that villain thread hits
        # the guard page before it can make damage to innocent one...
-       sub     %rsp,%r11
+       sub     %r10,%r11
        and     \$-4096,%r11
+       lea     (%r10,%r11),%rsp
+       mov     (%rsp),%r11
+       cmp     %r10,%rsp
+       ja      .Lmul_page_walk
+       jmp     .Lmul_page_walk_done
+
+.align 16
 .Lmul_page_walk:
-       mov     (%rsp,%r11),%r10
-       sub     \$4096,%r11
-       .byte   0x66,0x2e               # predict non-taken
-       jnc     .Lmul_page_walk
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r11
+       cmp     %r10,%rsp
+       ja      .Lmul_page_walk
+.Lmul_page_walk_done:
 
+       mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
+.Lmul_body:
        mov     $bp,%r12                # reassign $bp
 ___
                $bp="%r12";
@@ -323,13 +332,13 @@ $code.=<<___;
 
        mov     8(%rsp,$num,8),%rsi     # restore %rsp
        mov     \$1,%rax
-       mov     (%rsi),%r15
-       mov     8(%rsi),%r14
-       mov     16(%rsi),%r13
-       mov     24(%rsi),%r12
-       mov     32(%rsi),%rbp
-       mov     40(%rsi),%rbx
-       lea     48(%rsi),%rsp
+       mov     -48(%rsi),%r15
+       mov     -40(%rsi),%r14
+       mov     -32(%rsi),%r13
+       mov     -24(%rsi),%r12
+       mov     -16(%rsi),%rbp
+       mov     -8(%rsi),%rbx
+       lea     (%rsi),%rsp
 .Lmul_epilogue:
        ret
 .size  bn_mul_mont,.-bn_mul_mont
@@ -341,6 +350,8 @@ $code.=<<___;
 .type  bn_mul4x_mont,\@function,6
 .align 16
 bn_mul4x_mont:
+       mov     ${num}d,${num}d
+       mov     %rsp,%rax
 .Lmul4x_enter:
 ___
 $code.=<<___ if ($addx);
@@ -356,23 +367,29 @@ $code.=<<___;
        push    %r14
        push    %r15
 
-       mov     ${num}d,${num}d
-       lea     4($num),%r10
+       neg     $num
        mov     %rsp,%r11
-       neg     %r10
-       lea     (%rsp,%r10,8),%rsp      # tp=alloca(8*(num+4))
-       and     \$-1024,%rsp            # minimize TLB usage
+       lea     -32(%rsp,$num,8),%r10   # future alloca(8*(num+4))
+       neg     $num                    # restore
+       and     \$-1024,%r10            # minimize TLB usage
 
-       mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
-.Lmul4x_body:
-       sub     %rsp,%r11
+       sub     %r10,%r11
        and     \$-4096,%r11
+       lea     (%r10,%r11),%rsp
+       mov     (%rsp),%r11
+       cmp     %r10,%rsp
+       ja      .Lmul4x_page_walk
+       jmp     .Lmul4x_page_walk_done
+
 .Lmul4x_page_walk:
-       mov     (%rsp,%r11),%r10
-       sub     \$4096,%r11
-       .byte   0x2e                    # predict non-taken
-       jnc     .Lmul4x_page_walk
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r11
+       cmp     %r10,%rsp
+       ja      .Lmul4x_page_walk
+.Lmul4x_page_walk_done:
 
+       mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
+.Lmul4x_body:
        mov     $rp,16(%rsp,$num,8)     # tp[num+2]=$rp
        mov     %rdx,%r12               # reassign $bp
 ___
@@ -751,13 +768,13 @@ ___
 $code.=<<___;
        mov     8(%rsp,$num,8),%rsi     # restore %rsp
        mov     \$1,%rax
-       mov     (%rsi),%r15
-       mov     8(%rsi),%r14
-       mov     16(%rsi),%r13
-       mov     24(%rsi),%r12
-       mov     32(%rsi),%rbp
-       mov     40(%rsi),%rbx
-       lea     48(%rsi),%rsp
+       mov     -48(%rsi),%r15
+       mov     -40(%rsi),%r14
+       mov     -32(%rsi),%r13
+       mov     -24(%rsi),%r12
+       mov     -16(%rsi),%rbp
+       mov     -8(%rsi),%rbx
+       lea     (%rsi),%rsp
 .Lmul4x_epilogue:
        ret
 .size  bn_mul4x_mont,.-bn_mul4x_mont
@@ -787,14 +804,15 @@ $code.=<<___;
 .type  bn_sqr8x_mont,\@function,6
 .align 32
 bn_sqr8x_mont:
-.Lsqr8x_enter:
        mov     %rsp,%rax
+.Lsqr8x_enter:
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
+.Lsqr8x_prologue:
 
        mov     ${num}d,%r10d
        shl     \$3,${num}d             # convert $num to bytes
@@ -807,33 +825,42 @@ bn_sqr8x_mont:
        # do its job.
        #
        lea     -64(%rsp,$num,2),%r11
+       mov     %rsp,%rbp
        mov     ($n0),$n0               # *n0
        sub     $aptr,%r11
        and     \$4095,%r11
        cmp     %r11,%r10
        jb      .Lsqr8x_sp_alt
-       sub     %r11,%rsp               # align with $aptr
-       lea     -64(%rsp,$num,2),%rsp   # alloca(frame+2*$num)
+       sub     %r11,%rbp               # align with $aptr
+       lea     -64(%rbp,$num,2),%rbp   # future alloca(frame+2*$num)
        jmp     .Lsqr8x_sp_done
 
 .align 32
 .Lsqr8x_sp_alt:
        lea     4096-64(,$num,2),%r10   # 4096-frame-2*$num
-       lea     -64(%rsp,$num,2),%rsp   # alloca(frame+2*$num)
+       lea     -64(%rbp,$num,2),%rbp   # future alloca(frame+2*$num)
        sub     %r10,%r11
        mov     \$0,%r10
        cmovc   %r10,%r11
-       sub     %r11,%rsp
+       sub     %r11,%rbp
 .Lsqr8x_sp_done:
-       and     \$-64,%rsp
-       mov     %rax,%r11
-       sub     %rsp,%r11
+       and     \$-64,%rbp
+       mov     %rsp,%r11
+       sub     %rbp,%r11
        and     \$-4096,%r11
+       lea     (%rbp,%r11),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lsqr8x_page_walk
+       jmp     .Lsqr8x_page_walk_done
+
+.align 16
 .Lsqr8x_page_walk:
-       mov     (%rsp,%r11),%r10
-       sub     \$4096,%r11
-       .byte   0x2e                    # predict non-taken
-       jnc     .Lsqr8x_page_walk
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lsqr8x_page_walk
+.Lsqr8x_page_walk_done:
 
        mov     $num,%r10
        neg     $num
@@ -957,30 +984,38 @@ $code.=<<___;
 .type  bn_mulx4x_mont,\@function,6
 .align 32
 bn_mulx4x_mont:
-.Lmulx4x_enter:
        mov     %rsp,%rax
+.Lmulx4x_enter:
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
+.Lmulx4x_prologue:
 
        shl     \$3,${num}d             # convert $num to bytes
-       .byte   0x67
        xor     %r10,%r10
        sub     $num,%r10               # -$num
        mov     ($n0),$n0               # *n0
-       lea     -72(%rsp,%r10),%rsp     # alloca(frame+$num+8)
-       and     \$-128,%rsp
-       mov     %rax,%r11
-       sub     %rsp,%r11
+       lea     -72(%rsp,%r10),%rbp     # future alloca(frame+$num+8)
+       and     \$-128,%rbp
+       mov     %rsp,%r11
+       sub     %rbp,%r11
        and     \$-4096,%r11
+       lea     (%rbp,%r11),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lmulx4x_page_walk
+       jmp     .Lmulx4x_page_walk_done
+
+.align 16
 .Lmulx4x_page_walk:
-       mov     (%rsp,%r11),%r10
-       sub     \$4096,%r11
-       .byte   0x66,0x2e               # predict non-taken
-       jnc     .Lmulx4x_page_walk
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
 
        lea     ($bp,$num),%r10
        ##############################################################
@@ -1341,22 +1376,8 @@ mul_handler:
 
        mov     192($context),%r10      # pull $num
        mov     8(%rax,%r10,8),%rax     # pull saved stack pointer
-       lea     48(%rax),%rax
-
-       mov     -8(%rax),%rbx
-       mov     -16(%rax),%rbp
-       mov     -24(%rax),%r12
-       mov     -32(%rax),%r13
-       mov     -40(%rax),%r14
-       mov     -48(%rax),%r15
-       mov     %rbx,144($context)      # restore context->Rbx
-       mov     %rbp,160($context)      # restore context->Rbp
-       mov     %r12,216($context)      # restore context->R12
-       mov     %r13,224($context)      # restore context->R13
-       mov     %r14,232($context)      # restore context->R14
-       mov     %r15,240($context)      # restore context->R15
 
-       jmp     .Lcommon_seh_tail
+       jmp     .Lcommon_pop_regs
 .size  mul_handler,.-mul_handler
 
 .type  sqr_handler,\@abi-omnipotent
@@ -1384,15 +1405,21 @@ sqr_handler:
        cmp     %r10,%rbx               # context->Rip<.Lsqr_body
        jb      .Lcommon_seh_tail
 
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # body label
+       cmp     %r10,%rbx               # context->Rip>=.Lsqr_epilogue
+       jb      .Lcommon_pop_regs
+
        mov     152($context),%rax      # pull context->Rsp
 
-       mov     4(%r11),%r10d           # HandlerData[1]
+       mov     8(%r11),%r10d           # HandlerData[2]
        lea     (%rsi,%r10),%r10        # epilogue label
        cmp     %r10,%rbx               # context->Rip>=.Lsqr_epilogue
        jae     .Lcommon_seh_tail
 
        mov     40(%rax),%rax           # pull saved stack pointer
 
+.Lcommon_pop_regs:
        mov     -8(%rax),%rbx
        mov     -16(%rax),%rbp
        mov     -24(%rax),%r12
@@ -1479,13 +1506,15 @@ $code.=<<___;
 .LSEH_info_bn_sqr8x_mont:
        .byte   9,0,0,0
        .rva    sqr_handler
-       .rva    .Lsqr8x_body,.Lsqr8x_epilogue   # HandlerData[]
+       .rva    .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue          # HandlerData[]
+.align 8
 ___
 $code.=<<___ if ($addx);
 .LSEH_info_bn_mulx4x_mont:
        .byte   9,0,0,0
        .rva    sqr_handler
-       .rva    .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
+       .rva    .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue       # HandlerData[]
+.align 8
 ___
 }
 
index 2a7972d61086c87515a30a4bd6b19f51d8ce5534..3278dc60568ba4cd04dc473bf536083815095249 100755 (executable)
@@ -93,6 +93,8 @@ $code=<<___;
 .type  bn_mul_mont_gather5,\@function,6
 .align 64
 bn_mul_mont_gather5:
+       mov     ${num}d,${num}d
+       mov     %rsp,%rax
        test    \$7,${num}d
        jnz     .Lmul_enter
 ___
@@ -104,10 +106,7 @@ $code.=<<___;
 
 .align 16
 .Lmul_enter:
-       mov     ${num}d,${num}d
-       mov     %rsp,%rax
        movd    `($win64?56:8)`(%rsp),%xmm5     # load 7th argument
-       lea     .Linc(%rip),%r10
        push    %rbx
        push    %rbp
        push    %r12
@@ -115,13 +114,12 @@ $code.=<<___;
        push    %r14
        push    %r15
 
-       lea     2($num),%r11
-       neg     %r11
-       lea     -264(%rsp,%r11,8),%rsp  # tp=alloca(8*(num+2)+256+8)
-       and     \$-1024,%rsp            # minimize TLB usage
+       neg     $num
+       mov     %rsp,%r11
+       lea     -280(%rsp,$num,8),%r10  # future alloca(8*(num+2)+256+8)
+       neg     $num                    # restore $num
+       and     \$-1024,%r10            # minimize TLB usage
 
-       mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
-.Lmul_body:
        # An OS-agnostic version of __chkstk.
        #
        # Some OSes (Windows) insist on stack being "wired" to
@@ -130,13 +128,24 @@ $code.=<<___;
        # be punishable by SEGV. But page walking can do good even on
        # other OSes, because it guarantees that villain thread hits
        # the guard page before it can make damage to innocent one...
-       sub     %rsp,%rax
-       and     \$-4096,%rax
+       sub     %r10,%r11
+       and     \$-4096,%r11
+       lea     (%r10,%r11),%rsp
+       mov     (%rsp),%r11
+       cmp     %r10,%rsp
+       ja      .Lmul_page_walk
+       jmp     .Lmul_page_walk_done
+
 .Lmul_page_walk:
-       mov     (%rsp,%rax),%r11
-       sub     \$4096,%rax
-       .byte   0x2e                    # predict non-taken
-       jnc     .Lmul_page_walk
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r11
+       cmp     %r10,%rsp
+       ja      .Lmul_page_walk
+.Lmul_page_walk_done:
+
+       lea     .Linc(%rip),%r10
+       mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
+.Lmul_body:
 
        lea     128($bp),%r12           # reassign $bp (+size optimization)
 ___
@@ -442,6 +451,8 @@ $code.=<<___;
 .type  bn_mul4x_mont_gather5,\@function,6
 .align 32
 bn_mul4x_mont_gather5:
+       .byte   0x67
+       mov     %rsp,%rax
 .Lmul4x_enter:
 ___
 $code.=<<___ if ($addx);
@@ -450,14 +461,13 @@ $code.=<<___ if ($addx);
        je      .Lmulx4x_enter
 ___
 $code.=<<___;
-       .byte   0x67
-       mov     %rsp,%rax
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
+.Lmul4x_prologue:
 
        .byte   0x67
        shl     \$3,${num}d             # convert $num to bytes
@@ -474,32 +484,40 @@ $code.=<<___;
        # calculated from 7th argument, the index.]
        #
        lea     -320(%rsp,$num,2),%r11
+       mov     %rsp,%rbp
        sub     $rp,%r11
        and     \$4095,%r11
        cmp     %r11,%r10
        jb      .Lmul4xsp_alt
-       sub     %r11,%rsp               # align with $rp
-       lea     -320(%rsp,$num,2),%rsp  # alloca(frame+2*num*8+256)
+       sub     %r11,%rbp               # align with $rp
+       lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*num*8+256)
        jmp     .Lmul4xsp_done
 
 .align 32
 .Lmul4xsp_alt:
        lea     4096-320(,$num,2),%r10
-       lea     -320(%rsp,$num,2),%rsp  # alloca(frame+2*num*8+256)
+       lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*num*8+256)
        sub     %r10,%r11
        mov     \$0,%r10
        cmovc   %r10,%r11
-       sub     %r11,%rsp
+       sub     %r11,%rbp
 .Lmul4xsp_done:
-       and     \$-64,%rsp
-       mov     %rax,%r11
-       sub     %rsp,%r11
+       and     \$-64,%rbp
+       mov     %rsp,%r11
+       sub     %rbp,%r11
        and     \$-4096,%r11
+       lea     (%rbp,%r11),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lmul4x_page_walk
+       jmp     .Lmul4x_page_walk_done
+
 .Lmul4x_page_walk:
-       mov     (%rsp,%r11),%r10
-       sub     \$4096,%r11
-       .byte   0x2e                    # predict non-taken
-       jnc     .Lmul4x_page_walk
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lmul4x_page_walk
+.Lmul4x_page_walk_done:
 
        neg     $num
 
@@ -1043,6 +1061,7 @@ $code.=<<___;
 .type  bn_power5,\@function,6
 .align 32
 bn_power5:
+       mov     %rsp,%rax
 ___
 $code.=<<___ if ($addx);
        mov     OPENSSL_ia32cap_P+8(%rip),%r11d
@@ -1051,13 +1070,13 @@ $code.=<<___ if ($addx);
        je      .Lpowerx5_enter
 ___
 $code.=<<___;
-       mov     %rsp,%rax
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
+.Lpower5_prologue:
 
        shl     \$3,${num}d             # convert $num to bytes
        lea     ($num,$num,2),%r10d     # 3*$num
@@ -1072,32 +1091,40 @@ $code.=<<___;
        # calculated from 7th argument, the index.]
        #
        lea     -320(%rsp,$num,2),%r11
+       mov     %rsp,%rbp
        sub     $rptr,%r11
        and     \$4095,%r11
        cmp     %r11,%r10
        jb      .Lpwr_sp_alt
-       sub     %r11,%rsp               # align with $aptr
-       lea     -320(%rsp,$num,2),%rsp  # alloca(frame+2*num*8+256)
+       sub     %r11,%rbp               # align with $aptr
+       lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*num*8+256)
        jmp     .Lpwr_sp_done
 
 .align 32
 .Lpwr_sp_alt:
        lea     4096-320(,$num,2),%r10
-       lea     -320(%rsp,$num,2),%rsp  # alloca(frame+2*num*8+256)
+       lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*num*8+256)
        sub     %r10,%r11
        mov     \$0,%r10
        cmovc   %r10,%r11
-       sub     %r11,%rsp
+       sub     %r11,%rbp
 .Lpwr_sp_done:
-       and     \$-64,%rsp
-       mov     %rax,%r11
-       sub     %rsp,%r11
+       and     \$-64,%rbp
+       mov     %rsp,%r11
+       sub     %rbp,%r11
        and     \$-4096,%r11
+       lea     (%rbp,%r11),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lpwr_page_walk
+       jmp     .Lpwr_page_walk_done
+
 .Lpwr_page_walk:
-       mov     (%rsp,%r11),%r10
-       sub     \$4096,%r11
-       .byte   0x2e                    # predict non-taken
-       jnc     .Lpwr_page_walk
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lpwr_page_walk
+.Lpwr_page_walk_done:
 
        mov     $num,%r10       
        neg     $num
@@ -2037,6 +2064,7 @@ bn_from_mont8x:
        push    %r13
        push    %r14
        push    %r15
+.Lfrom_prologue:
 
        shl     \$3,${num}d             # convert $num to bytes
        lea     ($num,$num,2),%r10      # 3*$num in bytes
@@ -2051,32 +2079,40 @@ bn_from_mont8x:
        # last operation, we use the opportunity to cleanse it.
        #
        lea     -320(%rsp,$num,2),%r11
+       mov     %rsp,%rbp
        sub     $rptr,%r11
        and     \$4095,%r11
        cmp     %r11,%r10
        jb      .Lfrom_sp_alt
-       sub     %r11,%rsp               # align with $aptr
-       lea     -320(%rsp,$num,2),%rsp  # alloca(frame+2*$num*8+256)
+       sub     %r11,%rbp               # align with $aptr
+       lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*$num*8+256)
        jmp     .Lfrom_sp_done
 
 .align 32
 .Lfrom_sp_alt:
        lea     4096-320(,$num,2),%r10
-       lea     -320(%rsp,$num,2),%rsp  # alloca(frame+2*$num*8+256)
+       lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*$num*8+256)
        sub     %r10,%r11
        mov     \$0,%r10
        cmovc   %r10,%r11
-       sub     %r11,%rsp
+       sub     %r11,%rbp
 .Lfrom_sp_done:
-       and     \$-64,%rsp
-       mov     %rax,%r11
-       sub     %rsp,%r11
+       and     \$-64,%rbp
+       mov     %rsp,%r11
+       sub     %rbp,%r11
        and     \$-4096,%r11
+       lea     (%rbp,%r11),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lfrom_page_walk
+       jmp     .Lfrom_page_walk_done
+
 .Lfrom_page_walk:
-       mov     (%rsp,%r11),%r10
-       sub     \$4096,%r11
-       .byte   0x2e                    # predict non-taken
-       jnc     .Lfrom_page_walk
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lfrom_page_walk
+.Lfrom_page_walk_done:
 
        mov     $num,%r10
        neg     $num
@@ -2182,14 +2218,15 @@ $code.=<<___;
 .type  bn_mulx4x_mont_gather5,\@function,6
 .align 32
 bn_mulx4x_mont_gather5:
-.Lmulx4x_enter:
        mov     %rsp,%rax
+.Lmulx4x_enter:
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
+.Lmulx4x_prologue:
 
        shl     \$3,${num}d             # convert $num to bytes
        lea     ($num,$num,2),%r10      # 3*$num in bytes
@@ -2206,31 +2243,39 @@ bn_mulx4x_mont_gather5:
        # calculated from 7th argument, the index.]
        #
        lea     -320(%rsp,$num,2),%r11
+       mov     %rsp,%rbp
        sub     $rp,%r11
        and     \$4095,%r11
        cmp     %r11,%r10
        jb      .Lmulx4xsp_alt
-       sub     %r11,%rsp               # align with $aptr
-       lea     -320(%rsp,$num,2),%rsp  # alloca(frame+2*$num*8+256)
+       sub     %r11,%rbp               # align with $aptr
+       lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*$num*8+256)
        jmp     .Lmulx4xsp_done
 
 .Lmulx4xsp_alt:
        lea     4096-320(,$num,2),%r10
-       lea     -320(%rsp,$num,2),%rsp  # alloca(frame+2*$num*8+256)
+       lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*$num*8+256)
        sub     %r10,%r11
        mov     \$0,%r10
        cmovc   %r10,%r11
-       sub     %r11,%rsp
+       sub     %r11,%rbp
 .Lmulx4xsp_done:       
-       and     \$-64,%rsp              # ensure alignment
-       mov     %rax,%r11
-       sub     %rsp,%r11
+       and     \$-64,%rbp              # ensure alignment
+       mov     %rsp,%r11
+       sub     %rbp,%r11
        and     \$-4096,%r11
+       lea     (%rbp,%r11),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lmulx4x_page_walk
+       jmp     .Lmulx4x_page_walk_done
+
 .Lmulx4x_page_walk:
-       mov     (%rsp,%r11),%r10
-       sub     \$4096,%r11
-       .byte   0x2e                    # predict non-taken
-       jnc     .Lmulx4x_page_walk
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
 
        ##############################################################
        # Stack layout
@@ -2638,14 +2683,15 @@ $code.=<<___;
 .type  bn_powerx5,\@function,6
 .align 32
 bn_powerx5:
-.Lpowerx5_enter:
        mov     %rsp,%rax
+.Lpowerx5_enter:
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
+.Lpowerx5_prologue:
 
        shl     \$3,${num}d             # convert $num to bytes
        lea     ($num,$num,2),%r10      # 3*$num in bytes
@@ -2660,32 +2706,40 @@ bn_powerx5:
        # calculated from 7th argument, the index.]
        #
        lea     -320(%rsp,$num,2),%r11
+       mov     %rsp,%rbp
        sub     $rptr,%r11
        and     \$4095,%r11
        cmp     %r11,%r10
        jb      .Lpwrx_sp_alt
-       sub     %r11,%rsp               # align with $aptr
-       lea     -320(%rsp,$num,2),%rsp  # alloca(frame+2*$num*8+256)
+       sub     %r11,%rbp               # align with $aptr
+       lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*$num*8+256)
        jmp     .Lpwrx_sp_done
 
 .align 32
 .Lpwrx_sp_alt:
        lea     4096-320(,$num,2),%r10
-       lea     -320(%rsp,$num,2),%rsp  # alloca(frame+2*$num*8+256)
+       lea     -320(%rbp,$num,2),%rbp  # alloca(frame+2*$num*8+256)
        sub     %r10,%r11
        mov     \$0,%r10
        cmovc   %r10,%r11
-       sub     %r11,%rsp
+       sub     %r11,%rbp
 .Lpwrx_sp_done:
-       and     \$-64,%rsp
-       mov     %rax,%r11
-       sub     %rsp,%r11
+       and     \$-64,%rbp
+       mov     %rsp,%r11
+       sub     %rbp,%r11
        and     \$-4096,%r11
+       lea     (%rbp,%r11),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lpwrx_page_walk
+       jmp     .Lpwrx_page_walk_done
+
 .Lpwrx_page_walk:
-       mov     (%rsp,%r11),%r10
-       sub     \$4096,%r11
-       .byte   0x2e                    # predict non-taken
-       jnc     .Lpwrx_page_walk
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lpwrx_page_walk
+.Lpwrx_page_walk_done:
 
        mov     $num,%r10       
        neg     $num
@@ -3616,9 +3670,14 @@ mul_handler:
        cmp     %r10,%rbx               # context->Rip<end of prologue label
        jb      .Lcommon_seh_tail
 
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # epilogue label
+       cmp     %r10,%rbx               # context->Rip>=epilogue label
+       jb      .Lcommon_pop_regs
+
        mov     152($context),%rax      # pull context->Rsp
 
-       mov     4(%r11),%r10d           # HandlerData[1]
+       mov     8(%r11),%r10d           # HandlerData[2]
        lea     (%rsi,%r10),%r10        # epilogue label
        cmp     %r10,%rbx               # context->Rip>=epilogue label
        jae     .Lcommon_seh_tail
@@ -3630,11 +3689,11 @@ mul_handler:
        mov     192($context),%r10      # pull $num
        mov     8(%rax,%r10,8),%rax     # pull saved stack pointer
 
-       jmp     .Lbody_proceed
+       jmp     .Lcommon_pop_regs
 
 .Lbody_40:
        mov     40(%rax),%rax           # pull saved stack pointer
-.Lbody_proceed:
+.Lcommon_pop_regs:
        mov     -8(%rax),%rbx
        mov     -16(%rax),%rbp
        mov     -24(%rax),%r12
@@ -3725,34 +3784,34 @@ $code.=<<___;
 .LSEH_info_bn_mul_mont_gather5:
        .byte   9,0,0,0
        .rva    mul_handler
-       .rva    .Lmul_body,.Lmul_epilogue               # HandlerData[]
+       .rva    .Lmul_body,.Lmul_body,.Lmul_epilogue            # HandlerData[]
 .align 8
 .LSEH_info_bn_mul4x_mont_gather5:
        .byte   9,0,0,0
        .rva    mul_handler
-       .rva    .Lmul4x_body,.Lmul4x_epilogue           # HandlerData[]
+       .rva    .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue          # HandlerData[]
 .align 8
 .LSEH_info_bn_power5:
        .byte   9,0,0,0
        .rva    mul_handler
-       .rva    .Lpower5_body,.Lpower5_epilogue         # HandlerData[]
+       .rva    .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue       # HandlerData[]
 .align 8
 .LSEH_info_bn_from_mont8x:
        .byte   9,0,0,0
        .rva    mul_handler
-       .rva    .Lfrom_body,.Lfrom_epilogue             # HandlerData[]
+       .rva    .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue             # HandlerData[]
 ___
 $code.=<<___ if ($addx);
 .align 8
 .LSEH_info_bn_mulx4x_mont_gather5:
        .byte   9,0,0,0
        .rva    mul_handler
-       .rva    .Lmulx4x_body,.Lmulx4x_epilogue         # HandlerData[]
+       .rva    .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue       # HandlerData[]
 .align 8
 .LSEH_info_bn_powerx5:
        .byte   9,0,0,0
        .rva    mul_handler
-       .rva    .Lpowerx5_body,.Lpowerx5_epilogue       # HandlerData[]
+       .rva    .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue    # HandlerData[]
 ___
 $code.=<<___;
 .align 8