summary |
shortlog |
log |
commit | commitdiff |
tree
raw |
patch |
inline | side by side (from parent 1:
56cd71b)
Some OSes, *cough*-dows, insist on stack being "wired" to
physical memory in strictly sequential manner, i.e. if stack
allocation spans two pages, then reference to farmost one can
be punishable by SEGV. But page walking can do good even on
other OSes, because it guarantees that villain thread hits
the guard page before it can make damage to innocent one...
Reviewed-by: Rich Salz <rsalz@openssl.org>
&and ("esp",-64); # align to cache line
&and ("esp",-64); # align to cache line
+ # Some OSes, *cough*-dows, insist on stack being "wired" to
+ # physical memory in strictly sequential manner, i.e. if stack
+ # allocation spans two pages, then reference to farmost one can
+ # be punishable by SEGV. But page walking can do good even on
+ # other OSes, because it guarantees that villain thread hits
+ # the guard page before it can make damage to innocent one...
+ &mov ("eax","ebp");
+ &sub ("eax","esp");
+ &and ("eax",-4096);
+&set_label("page_walk");
+ &mov ("edx",&DWP(0,"esp","eax"));
+ &sub ("eax",4096);
+ &data_byte(0x2e);
+ &jnc (&label("page_walk"));
+
################################# load argument block...
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
################################# load argument block...
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
.Lmul_body:
mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
.Lmul_body:
+ # Some OSes, *cough*-dows, insist on stack being "wired" to
+ # physical memory in strictly sequential manner, i.e. if stack
+ # allocation spans two pages, then reference to farmost one can
+ # be punishable by SEGV. But page walking can do good even on
+ # other OSes, because it guarantees that villain thread hits
+ # the guard page before it can make damage to innocent one...
+ sub %rsp,%r11
+ and \$-4096,%r11
+.Lmul_page_walk:
+ mov (%rsp,%r11),%r10
+ sub \$4096,%r11
+ .byte 0x66,0x2e # predict non-taken
+ jnc .Lmul_page_walk
+
mov $bp,%r12 # reassign $bp
___
$bp="%r12";
mov $bp,%r12 # reassign $bp
___
$bp="%r12";
mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
.Lmul4x_body:
mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
.Lmul4x_body:
+ sub %rsp,%r11
+ and \$-4096,%r11
+.Lmul4x_page_walk:
+ mov (%rsp,%r11),%r10
+ sub \$4096,%r11
+ .byte 0x2e # predict non-taken
+ jnc .Lmul4x_page_walk
+
mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
mov %rdx,%r12 # reassign $bp
___
mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
mov %rdx,%r12 # reassign $bp
___
sub %r11,%rsp
.Lsqr8x_sp_done:
and \$-64,%rsp
sub %r11,%rsp
.Lsqr8x_sp_done:
and \$-64,%rsp
+ mov %rax,%r11
+ sub %rsp,%r11
+ and \$-4096,%r11
+.Lsqr8x_page_walk:
+ mov (%rsp,%r11),%r10
+ sub \$4096,%r11
+ .byte 0x2e # predict non-taken
+ jnc .Lsqr8x_page_walk
+
sub $num,%r10 # -$num
mov ($n0),$n0 # *n0
lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8)
sub $num,%r10 # -$num
mov ($n0),$n0 # *n0
lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8)
+ mov %rax,%r11
+ sub %rsp,%r11
+ and \$-4096,%r11
+.Lmulx4x_page_walk:
+ mov (%rsp,%r11),%r10
+ sub \$4096,%r11
+ .byte 0x66,0x2e # predict non-taken
+ jnc .Lmulx4x_page_walk
+
+ lea ($bp,$num),%r10
##############################################################
# Stack layout
# +0 num
##############################################################
# Stack layout
# +0 num
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
.Lmul_body:
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
.Lmul_body:
+ # Some OSes, *cough*-dows, insist on stack being "wired" to
+ # physical memory in strictly sequential manner, i.e. if stack
+ # allocation spans two pages, then reference to farmost one can
+ # be punishable by SEGV. But page walking can do good even on
+ # other OSes, because it guarantees that villain thread hits
+ # the guard page before it can make damage to innocent one...
+ sub %rsp,%rax
+ and \$-4096,%rax
+.Lmul_page_walk:
+ mov (%rsp,%rax),%r11
+ sub \$4096,%rax
+ .byte 0x2e # predict non-taken
+ jnc .Lmul_page_walk
+
lea 128($bp),%r12 # reassign $bp (+size optimization)
___
$bp="%r12";
lea 128($bp),%r12 # reassign $bp (+size optimization)
___
$bp="%r12";
sub %r11,%rsp
.Lmul4xsp_done:
and \$-64,%rsp
sub %r11,%rsp
.Lmul4xsp_done:
and \$-64,%rsp
+ mov %rax,%r11
+ sub %rsp,%r11
+ and \$-4096,%r11
+.Lmul4x_page_walk:
+ mov (%rsp,%r11),%r10
+ sub \$4096,%r11
+ .byte 0x2e # predict non-taken
+ jnc .Lmul4x_page_walk
+
neg $num
mov %rax,40(%rsp)
neg $num
mov %rax,40(%rsp)
sub %r11,%rsp
.Lpwr_sp_done:
and \$-64,%rsp
sub %r11,%rsp
.Lpwr_sp_done:
and \$-64,%rsp
+ mov %rax,%r11
+ sub %rsp,%r11
+ and \$-4096,%r11
+.Lpwr_page_walk:
+ mov (%rsp,%r11),%r10
+ sub \$4096,%r11
+ .byte 0x2e # predict non-taken
+ jnc .Lpwr_page_walk
+
sub %r11,%rsp
.Lfrom_sp_done:
and \$-64,%rsp
sub %r11,%rsp
.Lfrom_sp_done:
and \$-64,%rsp
+ mov %rax,%r11
+ sub %rsp,%r11
+ and \$-4096,%r11
+.Lfrom_page_walk:
+ mov (%rsp,%r11),%r10
+ sub \$4096,%r11
+ .byte 0x2e # predict non-taken
+ jnc .Lfrom_page_walk
+
+ mov $num,%r10
neg $num
##############################################################
neg $num
##############################################################
sub %r11,%rsp
.Lmulx4xsp_done:
and \$-64,%rsp # ensure alignment
sub %r11,%rsp
.Lmulx4xsp_done:
and \$-64,%rsp # ensure alignment
+ mov %rax,%r11
+ sub %rsp,%r11
+ and \$-4096,%r11
+.Lmulx4x_page_walk:
+ mov (%rsp,%r11),%r10
+ sub \$4096,%r11
+ .byte 0x2e # predict non-taken
+ jnc .Lmulx4x_page_walk
+
##############################################################
# Stack layout
# +0 -num
##############################################################
# Stack layout
# +0 -num
sub %r11,%rsp
.Lpwrx_sp_done:
and \$-64,%rsp
sub %r11,%rsp
.Lpwrx_sp_done:
and \$-64,%rsp
+ mov %rax,%r11
+ sub %rsp,%r11
+ and \$-4096,%r11
+.Lpwrx_page_walk:
+ mov (%rsp,%r11),%r10
+ sub \$4096,%r11
+ .byte 0x2e # predict non-taken
+ jnc .Lpwrx_page_walk
+