bn/asm/x86[_64]-mont*.pl: complement alloca with page-walking.
authorAndy Polyakov <appro@openssl.org>
Fri, 4 Mar 2016 10:39:11 +0000 (11:39 +0100)
committerAndy Polyakov <appro@openssl.org>
Mon, 7 Mar 2016 13:58:14 +0000 (14:58 +0100)
Some OSes, *cough*-dows, insist on stack being "wired" to
physical memory in strictly sequential manner, i.e. if stack
allocation spans two pages, then reference to farmost one can
be punishable by SEGV. But page walking can do good even on
other OSes, because it guarantees that villain thread hits
the guard page before it can make damage to innocent one...

Reviewed-by: Rich Salz <rsalz@openssl.org>
crypto/bn/asm/x86-mont.pl
crypto/bn/asm/x86_64-mont.pl
crypto/bn/asm/x86_64-mont5.pl

index e8f6b05..89f4de6 100755 (executable)
@@ -85,6 +85,21 @@ $frame=32;                           # size of above frame rounded up to 16n
 
        &and    ("esp",-64);            # align to cache line
 
+       # Some OSes, *cough*-dows, insist on stack being "wired" to
+       # physical memory in strictly sequential manner, i.e. if stack
+       # allocation spans two pages, then reference to farmost one can
+       # be punishable by SEGV. But page walking can do good even on
+       # other OSes, because it guarantees that villain thread hits
+       # the guard page before it can make damage to innocent one...
+       &mov    ("eax","ebp");
+       &sub    ("eax","esp");
+       &and    ("eax",-4096);
+&set_label("page_walk");
+       &mov    ("edx",&DWP(0,"esp","eax"));
+       &sub    ("eax",4096);
+       &data_byte(0x2e);
+       &jnc    (&label("page_walk"));
+
        ################################# load argument block...
        &mov    ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
        &mov    ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
index 29ba122..8fb6c99 100755 (executable)
@@ -130,6 +130,20 @@ $code.=<<___;
 
        mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
 .Lmul_body:
+       # Some OSes, *cough*-dows, insist on stack being "wired" to
+       # physical memory in strictly sequential manner, i.e. if stack
+       # allocation spans two pages, then reference to farmost one can
+       # be punishable by SEGV. But page walking can do good even on
+       # other OSes, because it guarantees that villain thread hits
+       # the guard page before it can make damage to innocent one...
+       sub     %rsp,%r11
+       and     \$-4096,%r11
+.Lmul_page_walk:
+       mov     (%rsp,%r11),%r10
+       sub     \$4096,%r11
+       .byte   0x66,0x2e               # predict non-taken
+       jnc     .Lmul_page_walk
+
        mov     $bp,%r12                # reassign $bp
 ___
                $bp="%r12";
@@ -342,6 +356,14 @@ $code.=<<___;
 
        mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
 .Lmul4x_body:
+       sub     %rsp,%r11
+       and     \$-4096,%r11
+.Lmul4x_page_walk:
+       mov     (%rsp,%r11),%r10
+       sub     \$4096,%r11
+       .byte   0x2e                    # predict non-taken
+       jnc     .Lmul4x_page_walk
+
        mov     $rp,16(%rsp,$num,8)     # tp[num+2]=$rp
        mov     %rdx,%r12               # reassign $bp
 ___
@@ -795,6 +817,15 @@ bn_sqr8x_mont:
        sub     %r11,%rsp
 .Lsqr8x_sp_done:
        and     \$-64,%rsp
+       mov     %rax,%r11
+       sub     %rsp,%r11
+       and     \$-4096,%r11
+.Lsqr8x_page_walk:
+       mov     (%rsp,%r11),%r10
+       sub     \$4096,%r11
+       .byte   0x2e                    # predict non-taken
+       jnc     .Lsqr8x_page_walk
+
        mov     $num,%r10
        neg     $num
 
@@ -932,8 +963,17 @@ bn_mulx4x_mont:
        sub     $num,%r10               # -$num
        mov     ($n0),$n0               # *n0
        lea     -72(%rsp,%r10),%rsp     # alloca(frame+$num+8)
-       lea     ($bp,$num),%r10
        and     \$-128,%rsp
+       mov     %rax,%r11
+       sub     %rsp,%r11
+       and     \$-4096,%r11
+.Lmulx4x_page_walk:
+       mov     (%rsp,%r11),%r10
+       sub     \$4096,%r11
+       .byte   0x66,0x2e               # predict non-taken
+       jnc     .Lmulx4x_page_walk
+
+       lea     ($bp,$num),%r10
        ##############################################################
        # Stack layout
        # +0    num
index 2e8c9db..938e170 100755 (executable)
@@ -115,6 +115,20 @@ $code.=<<___;
 
        mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
 .Lmul_body:
+       # Some OSes, *cough*-dows, insist on stack being "wired" to
+       # physical memory in strictly sequential manner, i.e. if stack
+       # allocation spans two pages, then reference to farmost one can
+       # be punishable by SEGV. But page walking can do good even on
+       # other OSes, because it guarantees that villain thread hits
+       # the guard page before it can make damage to innocent one...
+       sub     %rsp,%rax
+       and     \$-4096,%rax
+.Lmul_page_walk:
+       mov     (%rsp,%rax),%r11
+       sub     \$4096,%rax
+       .byte   0x2e                    # predict non-taken
+       jnc     .Lmul_page_walk
+
        lea     128($bp),%r12           # reassign $bp (+size optimization)
 ___
                $bp="%r12";
@@ -469,6 +483,15 @@ $code.=<<___;
        sub     %r11,%rsp
 .Lmul4xsp_done:
        and     \$-64,%rsp
+       mov     %rax,%r11
+       sub     %rsp,%r11
+       and     \$-4096,%r11
+.Lmul4x_page_walk:
+       mov     (%rsp,%r11),%r10
+       sub     \$4096,%r11
+       .byte   0x2e                    # predict non-taken
+       jnc     .Lmul4x_page_walk
+
        neg     $num
 
        mov     %rax,40(%rsp)
@@ -1058,6 +1081,15 @@ $code.=<<___;
        sub     %r11,%rsp
 .Lpwr_sp_done:
        and     \$-64,%rsp
+       mov     %rax,%r11
+       sub     %rsp,%r11
+       and     \$-4096,%r11
+.Lpwr_page_walk:
+       mov     (%rsp,%r11),%r10
+       sub     \$4096,%r11
+       .byte   0x2e                    # predict non-taken
+       jnc     .Lpwr_page_walk
+
        mov     $num,%r10       
        neg     $num
 
@@ -2028,7 +2060,16 @@ bn_from_mont8x:
        sub     %r11,%rsp
 .Lfrom_sp_done:
        and     \$-64,%rsp
-       mov     $num,%r10       
+       mov     %rax,%r11
+       sub     %rsp,%r11
+       and     \$-4096,%r11
+.Lfrom_page_walk:
+       mov     (%rsp,%r11),%r10
+       sub     \$4096,%r11
+       .byte   0x2e                    # predict non-taken
+       jnc     .Lfrom_page_walk
+
+       mov     $num,%r10
        neg     $num
 
        ##############################################################
@@ -2173,6 +2214,15 @@ bn_mulx4x_mont_gather5:
        sub     %r11,%rsp
 .Lmulx4xsp_done:       
        and     \$-64,%rsp              # ensure alignment
+       mov     %rax,%r11
+       sub     %rsp,%r11
+       and     \$-4096,%r11
+.Lmulx4x_page_walk:
+       mov     (%rsp,%r11),%r10
+       sub     \$4096,%r11
+       .byte   0x2e                    # predict non-taken
+       jnc     .Lmulx4x_page_walk
+
        ##############################################################
        # Stack layout
        # +0    -num
@@ -2619,6 +2669,15 @@ bn_powerx5:
        sub     %r11,%rsp
 .Lpwrx_sp_done:
        and     \$-64,%rsp
+       mov     %rax,%r11
+       sub     %rsp,%r11
+       and     \$-4096,%r11
+.Lpwrx_page_walk:
+       mov     (%rsp,%r11),%r10
+       sub     \$4096,%r11
+       .byte   0x2e                    # predict non-taken
+       jnc     .Lpwrx_page_walk
+
        mov     $num,%r10       
        neg     $num