/* **********************************************************
* Copyright (c) 2010-2014 Google, Inc. All rights reserved.
* Copyright (c) 2002-2010 VMware, Inc. All rights reserved.
* **********************************************************/
/*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of VMware, Inc. nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
/* Copyright (c) 2003-2007 Determina Corp. */
/* Copyright (c) 2002-2003 Massachusetts Institute of Technology */
/* Copyright (c) 2002 Hewlett-Packard Company */
/*
* callback.c - windows-specific callback, APC, and exception handling routines
*/
/* This whole file assumes x86 */
#include "configure.h"
#ifndef X86
#error X86 must be defined
#endif
#include "../globals.h"
#include "arch.h"
#include "instr.h"
#include "decode.h"
#include "../monitor.h"
#include "../fcache.h"
#include "../fragment.h"
#include "decode_fast.h"
#include "disassemble.h"
#include "instr_create.h"
#include "ntdll.h"
#include "events.h"
#include "os_private.h"
#include "../moduledb.h"
#include "aslr.h"
#include "../nudge.h" /* for generic_nudge_target() address */
#ifdef RETURN_AFTER_CALL
# include "../rct.h" /* for rct_ind_branch_target_lookup */
#endif
#include "instrument.h"
#include "../perscache.h"
#include "../translate.h"
#include <string.h> /* for memcpy */
#include <windows.h>
/* forward declarations */
static dcontext_t * callback_setup(app_pc next_pc);
static byte * insert_image_entry_trampoline(dcontext_t *dcontext);
static void swap_dcontexts(dcontext_t *done, dcontext_t *dtwo);
static void asynch_take_over(app_state_at_intercept_t *state);
/* currently we do not intercept top level exceptions */
#ifdef INTERCEPT_TOP_LEVEL_EXCEPTIONS
/* the app's top-level exception handler */
static LPTOP_LEVEL_EXCEPTION_FILTER app_top_handler;
#endif
/* All of our hooks use landing pads to then indirectly target
* this interception code, which in turn assumes it can directly
* reach our hook targets in the DR lib. Thus, we want this
* interception buffer to not be in vmcode nor vmheap, but near the
* DR lib: which is simplest with a static array.
* We write-protect this, so we don't need the ASLR of our heap.
*/
ALIGN_VAR(4096) static byte interception_code_array[INTERCEPTION_CODE_SIZE];
/* interception information
* if it weren't for syscall trampolines this could be a single page
* Note: if you add more intercept points, make sure to adjust
* NUM_INTERCEPT_POINTS below.
*/
static byte * interception_code = NULL;
static byte * interception_cur_pc = NULL;
static byte * ldr_init_pc = NULL;
static byte * callback_pc = NULL;
static byte * apc_pc = NULL;
static byte * exception_pc = NULL;
static byte * raise_exception_pc = NULL;
static byte * after_callback_orig_pc = NULL;
static byte * after_apc_orig_pc = NULL;
static byte * load_dll_pc = NULL;
static byte * unload_dll_pc = NULL;
static byte * image_entry_pc = NULL;
static byte * image_entry_trampoline = NULL;
static byte * syscall_trampolines_start = NULL;
static byte * syscall_trampolines_end = NULL;
/* We rely on the compiler doing the right thing
so when we dereference an imported function we get its real address
instead of a stub in our module. The loader does the rest of the magic.
*/
GET_NTDLL(KiUserApcDispatcher, (IN PVOID Unknown1,
IN PVOID Unknown2,
IN PVOID Unknown3,
IN PVOID ContextStart,
IN PVOID ContextBody));
GET_NTDLL(KiUserCallbackDispatcher, (IN PVOID Unknown1,
IN PVOID Unknown2,
IN PVOID Unknown3));
GET_NTDLL(KiUserExceptionDispatcher, (IN PVOID Unknown1,
IN PVOID Unknown2));
GET_NTDLL(KiRaiseUserExceptionDispatcher, (void));
/* generated routine for taking over native threads */
byte *thread_attach_takeover;
static byte *
emit_takeover_code(byte *pc);
/* For detach */
bool init_apc_go_native = false;
bool init_apc_go_native_pause = false;
/* overridden by dr_preinjected, or retakeover_after_native() */
static retakeover_point_t interception_point = INTERCEPT_PREINJECT;
/* While emitting the trampoline, the alt. target is unknown for hotp_only. */
#define CURRENTLY_UNKNOWN ((byte *)(ptr_uint_t) 0xdeadc0de)
#ifdef DEBUG
#define INTERCEPT_POINT(point) STRINGIFY(point),
static const char * const retakeover_names[] = {
INTERCEPT_ALL_POINTS
};
#undef INTERCEPT_POINT
#endif
/* We keep a list of mappings from intercept points to original app PCs */
typedef struct _intercept_map_elem_t {
byte *interception_pc;
app_pc original_app_pc;
size_t displace_length; /* includes jmp back */
size_t orig_length;
struct _intercept_map_elem_t *next;
} intercept_map_elem_t;
typedef struct _intercept_map_t {
intercept_map_elem_t *head;
intercept_map_elem_t *tail;
} intercept_map_t;
static intercept_map_t *intercept_map;
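/* A small worked example of the mapping (hypothetical addresses; see
* get_app_pc_from_intercept_pc() below for the actual translation): for an
* entry with interception_pc = I, original_app_pc = A, orig_length = 5 and
* displace_length = 10 (5 displaced bytes plus the 5-byte jmp back), an
* intercept pc of I+3 translates to A+3, while any pc in the jmp back
* (I+5 .. I+9) translates to A+5, i.e., the first app instruction after
* the displaced region.
*/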
DECLARE_CXTSWPROT_VAR(static mutex_t map_intercept_pc_lock,
INIT_LOCK_FREE(map_intercept_pc_lock));
DECLARE_CXTSWPROT_VAR(static mutex_t emulate_write_lock,
INIT_LOCK_FREE(emulate_write_lock));
#ifdef STACK_GUARD_PAGE
DECLARE_CXTSWPROT_VAR(static mutex_t exception_stack_lock,
INIT_LOCK_FREE(exception_stack_lock));
#endif
DECLARE_CXTSWPROT_VAR(static mutex_t intercept_hook_lock,
INIT_LOCK_FREE(intercept_hook_lock));
/* Used only on Vista, where new threads start directly here instead of going
* through KiUserApcDispatcher first. It isn't in our lib (though it is exported
* on 2k, xp and vista at least) so we get it dynamically. */
static byte *LdrInitializeThunk = NULL;
/* On vista this is the address the kernel sets (via NtCreateThreadEx, used by all the
* api routines) as Xip in the context the LdrInitializeThunk NtContinue's to (is eqv.
* to the unexported kernel32!Base[Process,Thread]StartThunk in pre-Vista). Fortunately
* ntdll!RtlUserThreadStart is exported and we cache it here for use in
* intercept_new_thread(). Note that threads created by the legacy native
* NtCreateThread don't have to target this address. */
static byte *RtlUserThreadStart = NULL;
#ifndef X64
/* Used to create a clean syscall wrapper on win8 where there's no ind call */
static byte *KiFastSystemCall = NULL;
#endif
/* i#1443: we need to identify threads queued up waiting for DR init.
* We can't use heap of course so we have to use a max count.
* We've never seen more than one at a time.
*/
#define MAX_THREADS_WAITING_FOR_DR_INIT 8
/* We assume INVALID_THREAD_ID is 0 (checked in callback_init()). */
/* These need to be neverprot for use w/ new threads. The risk is small. */
DECLARE_NEVERPROT_VAR(static thread_id_t threads_waiting_for_dr_init
[MAX_THREADS_WAITING_FOR_DR_INIT], {0});
/* This is also the next index+1 into the array to write to, incremented atomically. */
DECLARE_NEVERPROT_VAR(static uint threads_waiting_count, 0);
static inline app_pc
get_setcontext_interceptor()
{
return (app_pc) nt_continue_dynamo_start;
}
/* if tid != self, must hold thread_initexit_lock */
void
set_asynch_interception(thread_id_t tid, bool intercept)
{
/* Needed to turn on and off asynchronous event interception
* for non-entire-application-under-dynamo-control situations
*/
thread_record_t *tr = thread_lookup(tid);
ASSERT(tr != NULL);
tr->under_dynamo_control = intercept;
}
static inline bool
intercept_asynch_global()
{
return (intercept_asynch && !INTERNAL_OPTION(nullcalls));
}
/* if tr is not for calling thread, must hold thread_initexit_lock */
static bool
intercept_asynch_common(thread_record_t *tr, bool intercept_unknown)
{
if (!intercept_asynch_global())
return false;
if (tr == NULL) {
if (intercept_unknown)
return true;
/* caller should have made all attempts to get tr */
if (control_all_threads) {
/* we should know about all threads! */
SYSLOG_INTERNAL_WARNING("Received asynch event for unknown thread "TIDFMT"", get_thread_id());
/* try to make everything run rather than assert -- just do
* this asynch natively, we probably received it for a thread that's
* been created but not scheduled?
*/
}
return false;
}
/* FIXME: under_dynamo_control should be an enum w/ separate
* values for 1) truly native, 2) under DR but currently native_exec,
* 3) temporarily native b/c DR lost control (== UNDER_DYN_HACK), and
* 4) fully under DR
*/
DOSTATS({
if (IS_UNDER_DYN_HACK(tr->under_dynamo_control))
STATS_INC(num_asynch_while_lost);
});
return (tr->under_dynamo_control || IS_CLIENT_THREAD(tr->dcontext));
}
/* if tid != self, must hold thread_initexit_lock */
bool
intercept_asynch_for_thread(thread_id_t tid, bool intercept_unknown)
{
/* Needed to turn on and off asynchronous event interception
* for non-entire-application-under-dynamo-control situations
*/
thread_record_t *tr = thread_lookup(tid);
return intercept_asynch_common(tr, intercept_unknown);
}
bool
intercept_asynch_for_self(bool intercept_unknown)
{
/* To avoid problems with the all_threads_lock required to look
* up a thread in the thread table, we first see if it has a
* dcontext, and if so we get the thread_record_t from there.
* If not, it probably is a native thread and grabbing the lock
* should cause no problems as it should not currently be holding
* any locks.
*/
dcontext_t *dcontext = get_thread_private_dcontext();
if (dcontext != NULL)
return intercept_asynch_common(dcontext->thread_record, intercept_unknown);
else
return intercept_asynch_for_thread(get_thread_id(), intercept_unknown);
}
/***************************************************************************
* INTERCEPTION CODE FOR TRAMPOLINES INSERTED INTO APPLICATION CODE
interception code either assumes that the app's xsp is valid, or uses
dstack if available, or as a last resort uses initstack. when using
initstack, must make sure all paths exiting handler routine clear the
initstack mutex once not using the initstack itself!
We clobber TIB->PID, which is believed to be safe since no user-mode
code will execute there b/c thread is not alertable, and the kernel
shouldn't be reading (and trusting) user mode TIB structures.
FIXME: allocate and use a TIB scratch slot instead
N.B.: this interception code, if encountered by DR, is let run natively,
so make sure DR takes control at the end!
For trying to use the dstack, we have to be careful and check if we're
already on the dstack, which can happen for internal exceptions --
hopefully not for callbacks or apcs, we should assert on that =>
FIXME: add such checks to the cb and apc handlers, and split dstack
check as a separate parameter, once we make cbs and apcs not
assume_xsp (they still do for now since we haven't tested enough to
convince ourselves we never get them while on the dstack)
Unfortunately there's no easy way to check w/o modifying flags, so for
now we assume eflags whenever we do not assume xsp, unless we assume
we're not on the dstack.
Assumption should be ok for Ki*, also for Ldr*.
Alternatives: check later when we're in exception handler, only paths
there are terminate or forge exception. Thus we can get away w/o
reading anything on stack placed by kernel, but we won't have clean
call stack or anything else for diagnostics, and we'll have clobbered
the real xsp in the mcontext slot, which we use for forging the
exception.
Could perhaps use whereami==WHERE_FCACHE, but could get exception during
clean call or cxt switch when on dstack but prior to whereami change.
Note: the app registers passed to the handler are restored when going back to
the app, which means any changes made by the handler will be reflected
in the app state;
FIXME: change handler prototype to make all registers volatile so that the
compiler doesn't clobber them; for now it is the user's responsibility.
if (!assume_xsp)
mov xcx, fs:$PID_TIB_OFFSET # save xcx
mov fs:$TLS_DCONTEXT_OFFSET, xcx
jecxz no_local_stack
if (!assume_not_on_dstack)
# need to check if already on dstack
# assumes eflags!
mov $DSTACK(xcx), xcx
cmp xsp, xcx
jge not_on_dstack
lea -DYNAMORIO_STACK_SIZE(xcx), xcx
cmp xsp, xcx
jl not_on_dstack
# record stack method: using dstack/initstack unmodified
push xsp
push $2
jmp have_stack_now
not_on_dstack:
mov fs:$TLS_DCONTEXT_OFFSET, xcx
endif
# store app xsp in dcontext & switch to dstack; this will be used to save
# app xsp on the switched stack, i.e., dstack; not used after that.
if TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)
mov $MCONTEXT_OFFSET(xcx), xcx
endif
mov xsp, $XSP_OFFSET(xcx)
if TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)
mov fs:$TLS_DCONTEXT_OFFSET, xcx
endif
mov $DSTACK(xcx), xsp
# now get the app xsp from the dcontext and put it on the dstack; this
# will serve as the app xsp cache and will be used to send the correct
# app xsp to the handler and to restore app xsp at exit
if TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)
mov $MCONTEXT_OFFSET(xcx), xcx
endif
mov $XSP_OFFSET(xcx), xcx
push xcx
# need to record stack method, since dcontext could change in handler
push $1
jmp have_stack_now
no_local_stack:
# use initstack -- it's a global synch, but should only have no
# dcontext for initializing thread (where we actually use the app's
# stack) or exiting thread
# If we are already on the initstack, should just continue to use it.
# need to check if already on initstack
# assumes eflags, but we already did on this path for checking dstack
mov $INITSTACK, xcx
cmp xsp, xcx
jge grab_initstack
lea -DYNAMORIO_STACK_SIZE(xcx), xcx
cmp xsp, xcx
jl grab_initstack
push xsp
# record stack method: using dstack/initstack unmodified
push $2
jmp have_stack_now
grab_initstack:
mov $1, ecx # upper 32 bits zeroed on x64
if x64 # can't directly address initstack_mutex or initstack_app_xsp
# (though we could use rip-relative, nice to not have reachability issues
# if located far from dynamorio.dll, for general hooks (PR 250294)!)
# if a new thread we can't easily (w/o syscall) replace tid, so we use peb
mov xax, fs:$PEB_TIB_OFFSET # save xax
endif
get_lock:
if x64 # can't directly address initstack_mutex or initstack_app_xsp
mov $INITSTACK_MUTEX, xax
endif
# initstack_mutex.lock_requests is 32-bit
xchg ecx, IF_X64_ELSE((xax), initstack_mutex)
jecxz have_lock
pause # improve spin-loop perf on P4
jmp get_lock # no way to sleep or anything, must spin
have_lock:
# app xsp is saved in initstack_app_xsp only so that it can be accessed after
# switching to initstack; used only to set up the app xsp on the initstack
if x64 # we don't need to set initstack_app_xsp, just push the app xsp value
mov xsp, xcx
mov initstack, xax
xchg xax, xsp
push xcx
else
mov xsp, initstack_app_xsp
mov initstack, xsp
push initstack_app_xsp
endif
# need to record stack method, since dcontext could change in handler
push $0
if x64
mov $peb_ptr, xax
xchg fs:$PEB_TIB_OFFSET, xax # restore xax and peb ptr
endif
have_stack_now:
if x64
mov $global_pid, xcx
xchg fs:$PID_TIB_OFFSET, xcx # restore xcx and pid
else
mov fs:$PID_TIB_OFFSET, xcx # restore xcx
mov $global_pid, fs:$PID_TIB_OFFSET # restore TIB PID
endif
else
push xsp # cache app xsp so that it can be used to send the right value
# to the handler and to restore app xsp safely at exit
push $3 # recording stack type when using app stack
endif
# we assume here that we've done two pushes on the stack,
# which combined w/ the push0 and pushf give us 16-byte alignment
# for 32-bit and 64-bit prior to push-all-regs
clean_call_setup:
# lay out pc, eflags, regs, etc. in app_state_at_intercept_t order
push $0 # pc slot; unused; could use instead of state->start_pc
pushf
pusha (or push all regs for x64)
push $0 # ASSUMPTION: clearing, not preserving, is good enough
# FIXME: this won't work at CPL0 if we ever run there!
popf
# get the cached app xsp and write it to pusha location,
# so that the handler gets the correct app xsp
mov sizeof(priv_mcontext_t)+XSP_SZ(xsp), xax
mov xax, offsetof(priv_mcontext_t, xsp)(xsp)
if (ENTER_DR_HOOK != NULL)
call ENTER_DR_HOOK
endif
if x64
mov no_cleanup, xax
push xax
mov handler_arg, xax
push xax
else
push no_cleanup
push handler_arg
endif
# now we've laid out app_state_at_intercept_t on the stack
push/mov xsp # a pointer to the pushed values; this is the argument;
# see case 7597. may be passed in a register.
call handler
<clean up args>
lea 2*XSP_SZ(xsp), xsp # pop handler_arg + no_cleanup
if (AFTER_INTERCEPT_DYNAMIC_DECISION)
cmp xax, AFTER_INTERCEPT_LET_GO
je let_go
if (alternate target provided)
cmp xax, AFTER_INTERCEPT_LET_GO_ALT_DYN
je let_go_alt
endif
endif
if (AFTER_INTERCEPT_TAKE_OVER || AFTER_INTERCEPT_TAKE_OVER_SINGLE_SHOT
|| AFTER_INTERCEPT_DYNAMIC_DECISION)
if x64
mov no_cleanup, xax
push xax
else
push no_cleanup # state->start_pc
endif
push $0 # we assume always want !save_dcontext as arg to asynch_take_over
push/mov xsp # app_state_at_intercept_t *
call asynch_take_over
# should never reach here
push $0
push $-3 # internal_error will report -3 as line number
push $0
call internal_error
endif
if (AFTER_INTERCEPT_DYNAMIC_DECISION && alternate target provided)
let_go_alt:
<complete duplicate of let_go, but ending in a jmp to alternate target>
<(cannot really share much of let_go cleanup w/o more scratch slots)>
<(has to be first since original app instrs are placed by caller, not us)>
endif
if (!AFTER_INTERCEPT_TAKE_OVER)
let_go:
if (EXIT_DR_HOOK != NULL)
call EXIT_DR_HOOK
endif
# get the xsp passed to the handler, which may have been
# changed; store it in the xsp cache to restore at exit
mov offsetof(priv_mcontext_t, xsp)(xsp), xax
mov xax, sizeof(priv_mcontext_t)+XSP_SZ(xsp)
popa # or pop all regs on x64
popf
lea XSP_SZ(xsp), xsp # clear pc slot
if (!assume_xsp)
mov xcx, fs:$PID_TIB_OFFSET # save xcx
pop xcx # get back const telling stack used
pop xsp
jecxz restore_initstack
jmp done_restoring
restore_initstack:
if x64
mov &initstack_mutex, xcx
mov $0, (xcx)
else
mov $0, initstack_mutex
endif
done_restoring:
if x64
mov $global_pid, xcx
xchg fs:$PID_TIB_OFFSET, xcx # restore xcx and pid
else
mov fs:$PID_TIB_OFFSET, xcx # restore xcx
mov $global_pid, fs:$PID_TIB_OFFSET # restore TIB PID
endif
else
lea XSP_SZ(xsp), xsp # clear out the stack type
pop xsp # handler may have changed xsp; so get it from the xsp cache
endif
endif (!AFTER_INTERCEPT_TAKE_OVER)
no_cleanup:
<original app instructions>
=> handler signature, exported as typedef intercept_function_t:
void handler(app_state_at_intercept_t *args)
if AFTER_INTERCEPT_TAKE_OVER, then asynch_take_over is called, with "false" for
its save_dcontext parameter
handler must make sure all paths exiting handler routine clear the
initstack mutex once not using the initstack itself!
*/
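/* As a concrete illustration of the contract above, a minimal handler sketch
* (hypothetical: my_handler is not one of the hooks in this file and is shown
* only to make the calling convention concrete):
*
*   static void
*   my_handler(app_state_at_intercept_t *state)
*   {
*       ... read or modify the register values in *state; they are
*       restored into the app context on the way back out, so changes
*       made here are reflected in the app state ...
*   }
*
* Such a handler is passed to intercept_call() (below) as the
* intercept_function_t callee, along with the after_intercept_action_t that
* selects let-go vs. take-over behavior at the end of the emitted code.
*/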
#define APP instrlist_append
/* common routine separate since used for let go and alternate let go */
static void
insert_let_go_cleanup(dcontext_t *dcontext, byte *pc, instrlist_t *ilist,
instr_t *decision, bool assume_xsp, bool assume_not_on_dstack,
after_intercept_action_t action_after)
{
instr_t *first = NULL;
if (action_after == AFTER_INTERCEPT_DYNAMIC_DECISION) {
/* placeholder so can find 1st of this path */
first = instrlist_last(ilist);
}
if (EXIT_DR_HOOK != NULL) {
/* make sure to use dr_insert_call() rather than a raw OP_call instr,
* since x64 windows requires 32 bytes of stack space even w/ no args.
*/
IF_DEBUG(bool direct = )
dr_insert_call_ex((void *)dcontext, ilist, NULL/*append*/,
/* we're not in vmcode, so avoid indirect call */
pc, (void *)EXIT_DR_HOOK, 0);
ASSERT(direct);
}
/* Get the app xsp passed to the handler from the popa location and store
* it in the app xsp cache; this is because the handler could have changed
* the app xsp that was passed to it. CAUTION: do this before the popa.
*/
APP(ilist, INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_XAX),
OPND_CREATE_MEMPTR(REG_XSP,
offsetof(priv_mcontext_t, xsp))));
APP(ilist, INSTR_CREATE_mov_st(dcontext,
OPND_CREATE_MEMPTR(REG_XSP,
sizeof(priv_mcontext_t)+XSP_SZ),
opnd_create_reg(REG_XAX)));
/* now restore everything */
insert_pop_all_registers(dcontext, NULL, ilist, NULL, XSP_SZ/*see push_all use*/);
if (action_after == AFTER_INTERCEPT_DYNAMIC_DECISION) {
/* now that instrs are there, take 1st */
ASSERT(first != NULL);
instr_set_target(decision, opnd_create_instr(instr_get_next(first)));
}
if (!assume_xsp) {
instr_t *restore_initstack = INSTR_CREATE_label(dcontext);
instr_t *done_restoring = INSTR_CREATE_label(dcontext);
APP(ilist,
INSTR_CREATE_mov_st(dcontext,
opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL,
0, PID_TIB_OFFSET, OPSZ_PTR),
opnd_create_reg(REG_XCX)));
APP(ilist,
INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_XCX)));
/* popa doesn't restore xsp; the handler might have changed it, so
* restore it from the app xsp cache, which is now the top of stack.
*/
APP(ilist, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_XSP)));
APP(ilist, INSTR_CREATE_jecxz(dcontext, opnd_create_instr(restore_initstack)));
APP(ilist,
INSTR_CREATE_jmp(dcontext, opnd_create_instr(done_restoring)));
/* use initstack to avoid any assumptions about app xsp */
APP(ilist, restore_initstack);
#ifdef X64
APP(ilist,
INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_XCX),
OPND_CREATE_INTPTR((ptr_uint_t)&initstack_mutex)));
#endif
APP(ilist,
INSTR_CREATE_mov_st(dcontext,
IF_X64_ELSE(OPND_CREATE_MEM32(REG_XCX, 0),
OPND_CREATE_ABSMEM((void *)&initstack_mutex,
OPSZ_4)),
OPND_CREATE_INT32(0)));
APP(ilist, done_restoring);
#ifdef X64
/* we could perhaps assume the top 32 bits of win32_pid are zero, but
* xchg works just as well */
APP(ilist,
INSTR_CREATE_mov_imm(dcontext,
opnd_create_reg(REG_XCX),
OPND_CREATE_INTPTR((ptr_uint_t)win32_pid)));
APP(ilist,
INSTR_CREATE_xchg(dcontext,
opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL,
0, PID_TIB_OFFSET, OPSZ_PTR),
opnd_create_reg(REG_XCX)));
#else
APP(ilist,
INSTR_CREATE_mov_ld(dcontext,
opnd_create_reg(REG_XCX),
opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL,
0, PID_TIB_OFFSET, OPSZ_PTR)));
APP(ilist,
INSTR_CREATE_mov_st(dcontext,
opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL,
0, PID_TIB_OFFSET, OPSZ_PTR),
OPND_CREATE_INTPTR(win32_pid)));
#endif
} else {
/* popa doesn't restore xsp; the handler might have changed it, so
* restore it from the app xsp cache, which is now the top of stack.
*/
APP(ilist, INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XSP),
opnd_create_base_disp(REG_XSP, REG_NULL, 0,
XSP_SZ, OPSZ_0)));
APP(ilist, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_XSP)));
}
}
/* Emits a landing pad (shown below) and returns the address to the first
* instruction in it. Also returns the address where displaced app
* instrs should be copied in displaced_app_loc.
*
* The caller must call finalize_landing_pad_code() once finished copying
* the displaced app code, passing in the changed_prot value it received
* from this routine.
*
* CAUTION: These landing pad layouts are assumed in intercept_call() and in
* read_and_verify_dr_marker(), must_not_be_elided(), and
* is_syscall_trampoline().
*ifndef X64
* 32-bit landing pad:
* jmp tgt_pc ; 5 bytes, 32-bit relative jump
* displaced app instr(s) ; < (JMP_LONG_LENGTH + MAX_INSTR_LENGTH) bytes
* jmp after_hook_pc ; 5 bytes, 32-bit relative jump
*else
* 64-bit landing pad:
* tgt_pc ; 8 bytes of absolute address, i.e., tgt_pc
* jmp [tgt_pc] ; 6 bytes, 64-bit absolute indirect jmp
* displaced app instr(s) ; < (JMP_LONG_LENGTH + MAX_INSTR_LENGTH) bytes
* jmp after_hook_pc ; 5 bytes, 32-bit relative jump
*endif
*
* Note: For 64-bit landing pad, tgt_pc can be stored at the bottom of the
* trampoline too. I chose the top because it helps avoid a minor reachability
* problem: iff the landing pad is allocated at the topmost part of the
* reachability region for a given addr_to_hook, then there is a possibility
* that the return jmp from the landing pad may not reach the instruction after
* the hook address. This is because the number of bytes of the hook (5 bytes)
* and the number of bytes of the instruction(s) clobbered at the hook point
* might be different. If the clobbered bytes are more than 5 bytes, then the
* return jmp from the landing pad won't be able to reach it. By placing 8
* bytes above the landing pad, we give it the extra reachability needed.
* Also, having the tgt_pc at the top of the landing pad makes it easy to see
* the disassembly of the whole landing pad while debugging, else there will be
* jmp and garbage after it.
*
* This isn't a problem for 32-bit landing pad because in 32-bit everything is
* reachable.
*
* We must put the displaced app instr(s) in the landing pad for x64
* b/c they may contain rip-rel data refs and those may not reach if
* in the main trampoline (i#902).
*
* See heap.c for details about what landing pads are.
*/
#define JMP_SIZE (IF_X64_ELSE(JMP_ABS_IND64_SIZE, JMP_REL32_SIZE))
static byte *
emit_landing_pad_code(byte *lpad_buf, const byte *tgt_pc,
const byte *after_hook_pc,
size_t displaced_app_size,
byte **displaced_app_loc OUT,
bool *changed_prot)
{
byte *lpad_entry = lpad_buf;
bool res;
byte *lpad_start = lpad_buf;
ASSERT(lpad_buf != NULL);
res = make_hookable(lpad_buf, LANDING_PAD_SIZE, changed_prot);
ASSERT(res);
#ifndef X64
*lpad_buf = JMP_REL32_OPCODE;
lpad_buf++;
*((int *)lpad_buf) = (int)(tgt_pc - lpad_buf - 4);
lpad_buf += 4;
#else
*((byte **)lpad_buf) = (byte *)tgt_pc; /* save tgt_pc for the rip-rel jmp */
lpad_buf += sizeof(tgt_pc);
lpad_entry = lpad_buf; /* entry is after the first 8 bytes */
*lpad_buf = JMP_ABS_IND64_OPCODE;
lpad_buf++;
*lpad_buf = JMP_ABS_MEM_IND64_MODRM;
lpad_buf++;
/* rip relative address to 8-bytes, i.e., start of lpad_buf */
*((int *)lpad_buf) = -(int)(JMP_ABS_IND64_SIZE + sizeof(tgt_pc));
lpad_buf += 4;
#endif
/* Leave space for the displaced app code */
ASSERT(displaced_app_size < MAX_HOOK_DISPLACED_LENGTH);
ASSERT(displaced_app_loc != NULL);
*displaced_app_loc = lpad_buf;
lpad_buf += displaced_app_size;
/* The return 32-bit relative jump is common to both 32-bit and 64-bit
* landing pads. Make sure that the second jmp goes to the right address.
*/
ASSERT((size_t)(lpad_buf - lpad_start) ==
JMP_SIZE IF_X64(+ sizeof(tgt_pc)) + displaced_app_size);
*lpad_buf = JMP_REL32_OPCODE;
lpad_buf++;
*((int *)lpad_buf) = (int)(after_hook_pc - lpad_buf - 4);
lpad_buf += 4;
/* Even though we have the 8 byte space up front for 64-bit, just make sure
* that the return jmp can reach the instruction after the hook.
*/
ASSERT(REL32_REACHABLE(lpad_buf, after_hook_pc));
/* Make sure that the emitted code fits within LANDING_PAD_SIZE. */
ASSERT(lpad_buf - lpad_start <= LANDING_PAD_SIZE);
/* Return unused space */
trim_landing_pad(lpad_start, lpad_buf - lpad_start);
return lpad_entry;
}
static void
finalize_landing_pad_code(byte *lpad_buf, bool changed_prot)
{
make_unhookable(lpad_buf, LANDING_PAD_SIZE, changed_prot);
}
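/* A minimal sketch of the expected call sequence around the two routines
* above (see intercept_call() below for the real caller; hook_pc,
* trampoline_pc and displaced_size here are illustrative names):
*
*   byte *displaced_app_loc;
*   bool changed_prot;
*   byte *lpad = alloc_landing_pad(hook_pc);
*   byte *lpad_entry = emit_landing_pad_code(lpad, trampoline_pc,
*                                            hook_pc + displaced_size,
*                                            displaced_size,
*                                            &displaced_app_loc, &changed_prot);
*   ... copy the displaced app instrs to displaced_app_loc (e.g., via
*       copy_app_code()) ...
*   finalize_landing_pad_code(lpad, changed_prot);
*
* The 5-byte jmp written at hook_pc then targets lpad_entry, which jumps on
* to the trampoline; the displaced app code plus the jmp back live in the
* landing pad itself.
*/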
/* Assumes that ilist contains decoded instrs for [start_pc, start_pc+size).
* Copies size bytes of the app code at start_pc into buf by encoding
* the ilist, re-relativizing rip-relative and ctis as it goes along.
* Also converts short ctis into 32-bit-offset ctis.
*
* hotp_only does not support ctis in the middle of the ilist, only at
* the end, nor size changes in the middle of the ilist: to support
* that we'd need a relocation table mapping old instruction offsets
* to the newly emitted ones.
*
* As of today only one cti is allowed in a patch region and that too at
* the end of it, so the starting location of that cti won't change even if we
* convert and re-relativize it. This means hot patch control flow changes into
* the middle of a patch region won't have to worry about using an offset table.
*
* The current patch region definition doesn't allow ctis to be in the
* middle of patch regions. This means we don't have to worry about
* re-relativizing ctis in the middle of a patch region. However Alex has an
* argument about allowing cbrs to be completely inside a patch region as
* control flow can never reach the following instruction other than fall
* through, i.e., not from outside. This is a matter for debate, but one
* which will need the ilist & creating the relocation table per patch point.
*/
static byte *
copy_app_code(dcontext_t *dcontext, const byte *start_pc,
byte *buf, size_t size, instrlist_t *ilist)
{
instr_t *instr;
byte *buf_nxt;
DEBUG_DECLARE(byte *buf_start = buf;)
DEBUG_DECLARE(bool size_change = false;)
ASSERT(dcontext != NULL && start_pc != NULL && buf != NULL);
/* Patch region should be at least 5 bytes in length, but no more than 5
* plus the length of the last instruction in the region.
*/
ASSERT(size >= 5 && size <
(size_t)(5 + instr_length(dcontext, instrlist_last(ilist))));
/* We have to walk the instr list to lengthen short (8-bit) ctis */
for (instr = instrlist_first(ilist); instr != NULL; instr = instr_get_next(instr)) {
/* For short ctis in the loop to jecxz range, the cti conversion
* will set the target in the raw bits, so the raw bits will be valid.
* For other short ctis, the conversion will invalidate the raw bits,
* so a full encoding is enforced. For other ctis, the raw bits aren't
* valid for encoding because we are relocating them; so invalidate
* them explicitly.
*/
if (instr_opcode_valid(instr) && instr_is_cti(instr)) {
if (instr_is_cti_short(instr)) {
DODEBUG({ size_change = true; });
convert_to_near_rel(dcontext, instr);
} else
instr_set_raw_bits_valid(instr, false);
/* see notes above: hotp_only doesn't support non-final cti */
ASSERT(!instr_is_cti(instr) || instr == instrlist_last(ilist));
}
#ifdef X64
/* If we have reachability issues, instrlist_encode() below
* will fail. We try to do an assert here for that case
* (estimating where the relative offset will be encoded at).
* PR 250294's heap pad proposal will solve this.
*/
DOCHECK(1, {
app_pc target;
instr_get_rel_addr_target(instr, &target);
ASSERT_NOT_IMPLEMENTED
((!instr_has_rel_addr_reference(instr) ||
REL32_REACHABLE(buf, target)) &&
"PR 250294: displaced code too far from rip-rel target");
});
#endif
}
/* now encode and re-relativize x64 rip-relative instructions */
buf_nxt = instrlist_encode(dcontext, ilist, buf, false/*no instr_t targets*/);
ASSERT(buf_nxt != NULL);
ASSERT((buf_nxt - buf) == (ssize_t)size ||
size_change && (buf_nxt - buf) > (ssize_t)size);
return buf_nxt;
}
/* N.B.: !assume_xsp && !assume_not_on_dstack implies eflags assumptions!
* !assume_xsp && assume_not_on_dstack does not assume eflags.
* Could optimize by having a bool indicating whether to have a callee arg or not,
* but then the intercept_function_t typedef must be void, or must have two, so we
* just make every callee take an arg.
*
* Currently only hotp_only uses alt_after_tgt_p. It points at the pointer-sized
* target that initially has the value alternate_after. It is NOT intra-cache-line
* aligned and thus if the caller wants a hot-patchable target it must
* have another layer of indirection.
*/
static byte *
emit_intercept_code(dcontext_t *dcontext, byte *pc, intercept_function_t callee,
void *callee_arg, bool assume_xsp, bool assume_not_on_dstack,
after_intercept_action_t action_after, byte *alternate_after,
byte **alt_after_tgt_p OUT)
{
instrlist_t ilist;
instr_t *inst, *push_start, *push_start2 = NULL;
instr_t *decision = NULL, *alt_decision = NULL, *alt_after = NULL;
uint len;
byte *start_pc, *push_pc, *push_pc2 = NULL;
app_pc no_cleanup;
uint stack_offs = 0;
IF_DEBUG(bool direct;)
/* AFTER_INTERCEPT_LET_GO_ALT_DYN is used only dynamically to select alternate */
ASSERT(action_after != AFTER_INTERCEPT_LET_GO_ALT_DYN);
/* alternate_after provided only when possibly using alternate target */
ASSERT(alternate_after == NULL ||
action_after == AFTER_INTERCEPT_DYNAMIC_DECISION ||
action_after == AFTER_INTERCEPT_TAKE_OVER_SINGLE_SHOT);
/* initialize the ilist */
instrlist_init(&ilist);
if (!assume_xsp) {
instr_t *no_local_stack = INSTR_CREATE_label(dcontext);
instr_t *grab_initstack = INSTR_CREATE_label(dcontext);
instr_t *get_lock = INSTR_CREATE_label(dcontext);
instr_t *have_lock = INSTR_CREATE_label(dcontext);
instr_t *have_stack_now = INSTR_CREATE_label(dcontext);
APP(&ilist,
INSTR_CREATE_mov_st(dcontext,
opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL,
0, PID_TIB_OFFSET, OPSZ_PTR),
opnd_create_reg(REG_XCX)));
APP(&ilist,
INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_XCX),
opnd_create_tls_slot(os_tls_offset(TLS_DCONTEXT_SLOT))));
APP(&ilist, INSTR_CREATE_jecxz(dcontext, opnd_create_instr(no_local_stack)));
if (!assume_not_on_dstack) {
instr_t *not_on_dstack = INSTR_CREATE_label(dcontext);
APP(&ilist,
instr_create_restore_from_dc_via_reg(dcontext, REG_XCX, REG_XCX,
DSTACK_OFFSET));
APP(&ilist,
INSTR_CREATE_cmp(dcontext, opnd_create_reg(REG_XSP),
opnd_create_reg(REG_XCX)));
APP(&ilist,
INSTR_CREATE_jcc(dcontext, OP_jge, opnd_create_instr(not_on_dstack)));
APP(&ilist,
INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XCX),
opnd_create_base_disp(REG_XCX, REG_NULL, 0,
-(int)DYNAMORIO_STACK_SIZE, OPSZ_0)));
APP(&ilist,
INSTR_CREATE_cmp(dcontext, opnd_create_reg(REG_XSP),
opnd_create_reg(REG_XCX)));
APP(&ilist,
INSTR_CREATE_jcc(dcontext, OP_jl, opnd_create_instr(not_on_dstack)));
APP(&ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_XSP)));
APP(&ilist,
INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INT32(2)));
APP(&ilist,
INSTR_CREATE_jmp(dcontext, opnd_create_instr(have_stack_now)));
APP(&ilist, not_on_dstack);
APP(&ilist, INSTR_CREATE_mov_ld
(dcontext, opnd_create_reg(REG_XCX),
opnd_create_tls_slot(os_tls_offset(TLS_DCONTEXT_SLOT))));
}
/* Store the app xsp in dcontext and switch to dstack. */
if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) {
APP(&ilist,
instr_create_restore_from_dc_via_reg(dcontext, REG_XCX, REG_XCX,
PROT_OFFS));
}
APP(&ilist,
instr_create_save_to_dc_via_reg(dcontext, REG_XCX, REG_XSP, XSP_OFFSET));
if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) {
APP(&ilist, INSTR_CREATE_mov_ld
(dcontext, opnd_create_reg(REG_XCX),
opnd_create_tls_slot(os_tls_offset(TLS_DCONTEXT_SLOT))));
}
APP(&ilist,
instr_create_restore_from_dc_via_reg(dcontext, REG_XCX, REG_XSP,
DSTACK_OFFSET));
/* Get the app xsp from the dcontext and put it on the dstack to serve
* as the app xsp cache.
*/
if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) {
APP(&ilist,
instr_create_restore_from_dc_via_reg(dcontext,REG_XCX, REG_XCX,
PROT_OFFS));
}
APP(&ilist,
instr_create_restore_from_dc_via_reg(dcontext, REG_XCX, REG_XCX, XSP_OFFSET));
APP(&ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_XCX)));
APP(&ilist,
INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INT32(1)));
APP(&ilist,
INSTR_CREATE_jmp(dcontext, opnd_create_instr(have_stack_now)));
/* use initstack to avoid any assumptions about app xsp */
/* first check if we are already on it */
APP(&ilist, no_local_stack);
APP(&ilist,
INSTR_CREATE_mov_imm(dcontext,
opnd_create_reg(REG_XCX),
OPND_CREATE_INTPTR((ptr_int_t)initstack)));
APP(&ilist,
INSTR_CREATE_cmp(dcontext, opnd_create_reg(REG_XSP),
opnd_create_reg(REG_XCX)));
APP(&ilist,
INSTR_CREATE_jcc(dcontext, OP_jge, opnd_create_instr(grab_initstack)));
APP(&ilist,
INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XCX),
opnd_create_base_disp(REG_XCX, REG_NULL, 0,
-(int)DYNAMORIO_STACK_SIZE, OPSZ_0)));
APP(&ilist,
INSTR_CREATE_cmp(dcontext, opnd_create_reg(REG_XSP),
opnd_create_reg(REG_XCX)));
APP(&ilist,
INSTR_CREATE_jcc(dcontext, OP_jl, opnd_create_instr(grab_initstack)));
APP(&ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_XSP)));
APP(&ilist,
INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INT32(2)));
APP(&ilist,
INSTR_CREATE_jmp(dcontext, opnd_create_instr(have_stack_now)));
APP(&ilist, grab_initstack);
APP(&ilist,
INSTR_CREATE_mov_imm(dcontext,
/* on x64 the upper 32 bits will be zeroed for us */
opnd_create_reg(REG_ECX), OPND_CREATE_INT32(1)));
#ifdef X64
APP(&ilist,
INSTR_CREATE_mov_st(dcontext,
opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL,
0, PEB_TIB_OFFSET, OPSZ_PTR),
opnd_create_reg(REG_XAX)));
#endif
APP(&ilist, get_lock);
#ifdef X64
APP(&ilist,
INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_XAX),
OPND_CREATE_INTPTR((ptr_uint_t)&initstack_mutex)));
#endif
APP(&ilist,
INSTR_CREATE_xchg(dcontext,
/* initstack_mutex is 32 bits always */
IF_X64_ELSE(OPND_CREATE_MEM32(REG_XAX, 0),
OPND_CREATE_ABSMEM((void *)&initstack_mutex,
OPSZ_4)),
opnd_create_reg(REG_ECX)));
APP(&ilist,
INSTR_CREATE_jecxz(dcontext, opnd_create_instr(have_lock)));
APP(&ilist,
INSTR_CREATE_pause(dcontext));
APP(&ilist,
INSTR_CREATE_jmp(dcontext, opnd_create_instr(get_lock)));
APP(&ilist, have_lock);
APP(&ilist,
INSTR_CREATE_mov_st(dcontext,
IF_X64_ELSE(opnd_create_reg(REG_XCX),
OPND_CREATE_ABSMEM((void *)&initstack_app_xsp,
OPSZ_PTR)),
opnd_create_reg(REG_XSP)));
#ifdef X64
/* we can do a 64-bit absolute address into xax only */
APP(&ilist,
INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_XAX),
OPND_CREATE_ABSMEM((void *)&initstack, OPSZ_PTR)));
APP(&ilist,
INSTR_CREATE_xchg(dcontext, opnd_create_reg(REG_XSP),
opnd_create_reg(REG_XAX)));
#else
APP(&ilist,
INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_XSP),
OPND_CREATE_ABSMEM((void *)&initstack, OPSZ_PTR)));
#endif
APP(&ilist,
INSTR_CREATE_push(dcontext,
IF_X64_ELSE(opnd_create_reg(REG_XCX),
OPND_CREATE_ABSMEM((void *)&initstack_app_xsp,
OPSZ_PTR))));
APP(&ilist,
INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INT32(0)));
#ifdef X64
APP(&ilist,
INSTR_CREATE_mov_imm(dcontext,
opnd_create_reg(REG_XAX),
OPND_CREATE_INTPTR((ptr_uint_t)peb_ptr)));
APP(&ilist,
INSTR_CREATE_xchg(dcontext,
opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL,
0, PEB_TIB_OFFSET, OPSZ_PTR),
opnd_create_reg(REG_XAX)));
#endif
APP(&ilist, have_stack_now);
#ifdef X64
/* we could perhaps assume the top 32 bits of win32_pid are zero, but
* xchg works just as well */
APP(&ilist,
INSTR_CREATE_mov_imm(dcontext,
opnd_create_reg(REG_XCX),
OPND_CREATE_INTPTR((ptr_uint_t)win32_pid)));
APP(&ilist,
INSTR_CREATE_xchg(dcontext,
opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL,
0, PID_TIB_OFFSET, OPSZ_PTR),
opnd_create_reg(REG_XCX)));
#else
APP(&ilist,
INSTR_CREATE_mov_ld(dcontext,
opnd_create_reg(REG_XCX),
opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL,
0, PID_TIB_OFFSET, OPSZ_PTR)));
APP(&ilist,
INSTR_CREATE_mov_st(dcontext,
opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL,
0, PID_TIB_OFFSET, OPSZ_PTR),
OPND_CREATE_INTPTR(win32_pid)));
#endif /* X64 */
} else { /* assume_xsp */
/* Cache app xsp so that the right value can be passed to the handler
* and to restore at exit. Push stack type too: 3 for app stack.
*/
APP(&ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_XSP)));
APP(&ilist, INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INT32(3)));
}
/* We assume that if !assume_xsp we've done two pushes on the stack.
* DR often only cares about stack alignment for xmm saves.
* However, it sometimes calls ntdll routines; and for client exception
* handlers that might call random library routines we really care.
* We assume that the kernel hands us an aligned stack, so we use
* stack_offs to track our own pushes and maintain that alignment in the
* instrumentation.
*/
stack_offs = insert_push_all_registers
(dcontext, NULL, &ilist, NULL, XSP_SZ,
/* pc slot not used: could use instead of state->start_pc */
/* sign-extended */
INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INT32(0)));
/* clear eflags for callee's usage */
APP(&ilist,
INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INT32(0)));
APP(&ilist, INSTR_CREATE_RAW_popf(dcontext));
/* Get the cached app xsp and update the pusha's xsp with it; this is the
* right app xsp.
*/
APP(&ilist,
INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_XAX),
OPND_CREATE_MEMPTR(REG_XSP, /* mcxt + stack type */
sizeof(priv_mcontext_t)+XSP_SZ)));
APP(&ilist,
INSTR_CREATE_mov_st(dcontext,
OPND_CREATE_MEMPTR(REG_XSP,
offsetof(priv_mcontext_t, xsp)),
opnd_create_reg(REG_XAX)));
/* FIXME: don't want hooks for trampolines that run natively like
* LdrLoadDll or image entry, right?
*/
if (ENTER_DR_HOOK != NULL) {
/* make sure to use dr_insert_call() rather than a raw OP_call instr,
* since x64 windows requires 32 bytes of stack space even w/ no args.
*/
IF_DEBUG(direct = )
dr_insert_call_ex((void *)dcontext, &ilist, NULL/*append*/,
/* we're not in vmcode, so avoid indirect call */
pc, (void *)ENTER_DR_HOOK, 0);
ASSERT(direct);
}
/* these are part of app_state_at_intercept_t struct so we have to
* push them on the stack, rather than pass in registers
*/
/* will fill in immed with no_cleanup pointer later */
#ifdef X64
push_start = INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_XAX),
OPND_CREATE_INTPTR(0));
APP(&ilist, push_start);
APP(&ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_XAX)));
APP(&ilist, INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_XAX),
OPND_CREATE_INTPTR(callee_arg)));
APP(&ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_XAX)));
#else
push_start = INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INTPTR(0));
APP(&ilist, push_start);
APP(&ilist, INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INTPTR(callee_arg)));
#endif
stack_offs += 2*XSP_SZ;
/* We pass xsp as a pointer to all the values on the stack; this is the actual
* argument to the intercept routine. Fix for case 7597.
* -- CAUTION -- if app_state_at_intercept_t changes in any way, this can
* blow up! That structure's fields' types, order & layout are assumed
* here. These two should change only in synch.
*/
if (parameters_stack_padded()) {
/* xsp won't have proper value due to stack padding */
APP(&ilist, INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_XAX),
opnd_create_reg(REG_XSP)));
#ifdef X64
/* i#331: align the misaligned stack */
# define STACK_ALIGNMENT 16
if (!ALIGNED(stack_offs, STACK_ALIGNMENT)) {
ASSERT(ALIGNED(stack_offs, XSP_SZ));
APP(&ilist, INSTR_CREATE_lea
(dcontext, opnd_create_reg(REG_XSP),
opnd_create_base_disp(REG_XSP, REG_NULL, 0,
-(int)XSP_SZ, OPSZ_0)));
}
#endif
}
IF_DEBUG(direct = )
dr_insert_call_ex(dcontext, &ilist, NULL,
/* we're not in vmcode, so avoid indirect call */
pc, (byte *)callee, 1,
parameters_stack_padded() ? opnd_create_reg(REG_XAX) :
opnd_create_reg(REG_XSP));
ASSERT(direct);
#ifdef X64
/* i#331, misaligned stack adjustment cleanup */
if (parameters_stack_padded()) {
if (!ALIGNED(stack_offs, STACK_ALIGNMENT)) {
ASSERT(ALIGNED(stack_offs, XSP_SZ));
APP(&ilist, INSTR_CREATE_lea
(dcontext, opnd_create_reg(REG_XSP),
opnd_create_base_disp(REG_XSP, REG_NULL, 0,
XSP_SZ, OPSZ_0)));
}
}
#endif
/* clean up 2 pushes */
APP(&ilist, INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XSP),
opnd_create_base_disp(REG_XSP, REG_NULL, 0,
2*XSP_SZ, OPSZ_0)));
if (action_after == AFTER_INTERCEPT_DYNAMIC_DECISION) {
/* our 32-bit immed will be sign-extended.
* perhaps we could assume upper bits not set and use eax to save a rex.w.
*/
APP(&ilist, INSTR_CREATE_cmp(dcontext, opnd_create_reg(REG_XAX),
OPND_CREATE_INT32(AFTER_INTERCEPT_LET_GO)));
/* will fill in later */
decision = INSTR_CREATE_jcc(dcontext, OP_je, opnd_create_instr(NULL));
APP(&ilist, decision);
if (alternate_after != NULL) {
APP(&ilist, INSTR_CREATE_cmp
(dcontext, opnd_create_reg(REG_XAX),
OPND_CREATE_INT32(AFTER_INTERCEPT_LET_GO_ALT_DYN))); /*sign-extended*/
/* will fill in later */
alt_decision = INSTR_CREATE_jcc(dcontext, OP_je, opnd_create_instr(NULL));
APP(&ilist, alt_decision);
}
}
if (action_after == AFTER_INTERCEPT_TAKE_OVER ||
action_after == AFTER_INTERCEPT_TAKE_OVER_SINGLE_SHOT ||
action_after == AFTER_INTERCEPT_DYNAMIC_DECISION) {
/* will fill in immed with no_cleanup pointer later */
#ifdef X64
push_start2 = INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_XAX),
OPND_CREATE_INTPTR(0));
APP(&ilist, push_start2);
APP(&ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_XAX)));
#else
push_start2 = INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INTPTR(0));
APP(&ilist, push_start2);
#endif
APP(&ilist, INSTR_CREATE_push_imm(dcontext,
OPND_CREATE_INT32(0/*don't save dcontext*/)));
if (parameters_stack_padded()) {
/* xsp won't have proper value due to stack padding */
APP(&ilist, INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_XAX),
opnd_create_reg(REG_XSP)));
#ifdef X64
/* i#331: align the misaligned stack */
APP(&ilist, INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XSP),
opnd_create_base_disp(REG_XSP, REG_NULL, 0,
-(int)XSP_SZ, OPSZ_0)));
#endif
}
IF_DEBUG(direct = )
dr_insert_call_ex(dcontext, &ilist, NULL,
/* we're not in vmcode, so avoid indirect call */
pc, (app_pc)asynch_take_over, 1,
parameters_stack_padded() ? opnd_create_reg(REG_XAX) :
opnd_create_reg(REG_XSP));
ASSERT(direct);
#ifdef INTERNAL
IF_DEBUG(direct = )
dr_insert_call_ex(dcontext, &ilist, NULL,
/* we're not in vmcode, so avoid indirect call */
pc, (app_pc)internal_error, 3,
OPND_CREATE_INTPTR(0),
OPND_CREATE_INT32(-3),
OPND_CREATE_INTPTR(0));
ASSERT(direct);
#endif
#ifdef X64
if (parameters_stack_padded()) {
/* i#331: misaligned stack adjust cleanup*/
APP(&ilist, INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XSP),
opnd_create_base_disp(REG_XSP, REG_NULL, 0,
XSP_SZ, OPSZ_0)));
}
#endif
}
if (action_after == AFTER_INTERCEPT_LET_GO ||
action_after == AFTER_INTERCEPT_DYNAMIC_DECISION) {
if (alternate_after != NULL) {
byte *encode_pc;
insert_let_go_cleanup(dcontext, pc, &ilist, alt_decision,
assume_xsp, assume_not_on_dstack, action_after);
/* alternate after cleanup target */
/* if alt_after_tgt_p != NULL we always do pointer-sized even if
* the initial target happens to reach
*/
/* we assert below we're < PAGE_SIZE for reachability test */
encode_pc = (alt_after_tgt_p != NULL) ? vmcode_unreachable_pc() : pc;
IF_DEBUG(direct = )
insert_reachable_cti(dcontext, &ilist, NULL, encode_pc,
alternate_after, true/*jmp*/, false/*!precise*/,
DR_REG_NULL/*no scratch*/, &alt_after);
ASSERT(alt_after_tgt_p == NULL || !direct);
}
/* the normal let_go target */
insert_let_go_cleanup(dcontext, pc, &ilist, decision,
assume_xsp, assume_not_on_dstack, action_after);
}
/* now encode the instructions */
/* must set note fields first with offset */
len = 0;
push_pc = NULL;
for (inst = instrlist_first(&ilist); inst; inst = instr_get_next(inst)) {
instr_set_note(inst, (void *)(ptr_int_t)len);
len += instr_length(dcontext, inst);
}
start_pc = pc;
for (inst = instrlist_first(&ilist); inst; inst = instr_get_next(inst)) {
pc = instr_encode(dcontext, inst, pc);
ASSERT(pc != NULL);
if (inst == push_start)
push_pc = (pc - sizeof(ptr_uint_t));
if (inst == push_start2)
push_pc2 = (pc - sizeof(ptr_uint_t));
if (inst == alt_after && alt_after_tgt_p != NULL)
*alt_after_tgt_p = pc - sizeof(alternate_after);
}
/* now can point start_pc arg of callee at beyond-cleanup pc */
if (action_after == AFTER_INTERCEPT_TAKE_OVER_SINGLE_SHOT) {
/* Note the interface here allows any target. Yet as the name
* suggests it should mainly be used to directly transfer to
* the now restored trampoline target.
*/
ASSERT(alternate_after != NULL);
no_cleanup = alternate_after;
} else {
/* callers are supposed to append the original target prefix */
no_cleanup = pc;
}
ASSERT(push_pc != NULL);
*((ptr_uint_t*)push_pc) = (ptr_uint_t)no_cleanup;
if (push_pc2 != NULL)
*((ptr_uint_t*)push_pc2) = (ptr_uint_t)no_cleanup;
ASSERT(pc - start_pc < PAGE_SIZE && "adjust REL32_REACHABLE for alternate_after");
/* free the instrlist_t elements */
instrlist_clear(dcontext, &ilist);
return pc;
}
#undef APP
static void
map_intercept_pc_to_app_pc(byte *interception_pc, app_pc original_app_pc,
size_t displace_length, size_t orig_length)
{
intercept_map_elem_t *elem = HEAP_TYPE_ALLOC
(GLOBAL_DCONTEXT, intercept_map_elem_t, ACCT_OTHER, UNPROTECTED);
elem->interception_pc = interception_pc;
elem->original_app_pc = original_app_pc;
elem->displace_length = displace_length;
elem->orig_length = orig_length;
elem->next = NULL;
mutex_lock(&map_intercept_pc_lock);
if (intercept_map->head == NULL) {
intercept_map->head = elem;
intercept_map->tail = elem;
}
else {
intercept_map->tail->next = elem;
intercept_map->tail = elem;
}
mutex_unlock(&map_intercept_pc_lock);
}
static void
unmap_intercept_pc(app_pc original_app_pc)
{
intercept_map_elem_t *curr, *prev, *next;
mutex_lock(&map_intercept_pc_lock);
prev = NULL;
curr = intercept_map->head;
while (curr != NULL) {
next = curr->next;
if (curr->original_app_pc == original_app_pc) {
if (prev != NULL) {
prev->next = curr->next;
}
if (curr == intercept_map->head) {
intercept_map->head = curr->next;
}
if (curr == intercept_map->tail) {
intercept_map->tail = prev;
}
HEAP_TYPE_FREE(GLOBAL_DCONTEXT, curr, intercept_map_elem_t,
ACCT_OTHER, UNPROTECTED);
/* We don't break b/c we allow multiple entries and in fact
* we have multiple today: one for displaced app code and
* one for the jmp from interception buffer to landing pad.
*/
} else
prev = curr;
curr = next;
}
mutex_unlock(&map_intercept_pc_lock);
}
static void
free_intercept_list(void)
{
/* For all regular hooks, un_intercept_call() calls unmap_intercept_pc()
* and removes the hook's entry. But syscall wrappers have a target app
* pc that's unusual. Rather than store it for each, we just tear
* down the whole list.
*/
intercept_map_elem_t *curr;
mutex_lock(&map_intercept_pc_lock);
while (intercept_map->head != NULL) {
curr = intercept_map->head;
intercept_map->head = curr->next;
HEAP_TYPE_FREE(GLOBAL_DCONTEXT, curr, intercept_map_elem_t,
ACCT_OTHER, UNPROTECTED);
}
intercept_map->head = NULL;
intercept_map->tail = NULL;
mutex_unlock(&map_intercept_pc_lock);
}
/* We assume no mangling of code placed in the interception buffer,
* other than re-relativizing ctis. As such, we can uniquely correlate
* interception buffer PCs to their original app PCs.
* Caller must check that pc is actually in the intercept buffer (or landing
* pad displaced app code or jmp back).
*/
app_pc
get_app_pc_from_intercept_pc(byte *pc)
{
intercept_map_elem_t *iter = intercept_map->head;
while (iter != NULL) {
byte *start = iter->interception_pc;
byte *end = start + iter->displace_length;
if (pc >= start && pc < end) {
/* include jmp back but map it to instr after displacement */
if ((size_t)(pc - start) > iter->orig_length)
return iter->original_app_pc + iter->orig_length;
else
return iter->original_app_pc + (pc - start);
}
iter = iter->next;
}
ASSERT_NOT_REACHED();
return NULL;
}
bool
is_intercepted_app_pc(app_pc pc, byte **interception_pc)
{
intercept_map_elem_t *iter = intercept_map->head;
while (iter != NULL) {
/* i#268: respond for any pc not just the first.
* FIXME: do we handle app targeting middle of hook?
* I'm assuming here that we would not create another
* entry for that start and it's ok to not match only start.
*/
if (pc >= iter->original_app_pc &&
pc < iter->original_app_pc + iter->orig_length) {
/* PR 219351: For syscall trampolines, while building bbs we replace
* the jmp and never execute from the displaced app code in the
* buffer, so the bb looks normal. FIXME: should we just not add to
* the map? For now, better safe than sorry so
* get_app_pc_from_intercept_pc will work in case we ever ask about
* that displaced app code.
*/
if (is_syscall_trampoline(iter->interception_pc, NULL))
return false;
if (interception_pc != NULL)
*interception_pc = iter->interception_pc + (pc - iter->original_app_pc);
return true;
}
iter = iter->next;
}
return false;
}
/* Emits a jmp at pc to resume_pc. If pc is in the interception buffer,
* adds a map entry from [xl8_start_pc, return value here) to
* [app_pc, <same size>).
*/
static byte *
emit_resume_jmp(byte *pc, byte *resume_pc, byte *app_pc, byte *xl8_start_pc)
{
#ifndef X64
*pc = JMP_REL32_OPCODE; pc++;
*((int *)pc) = (int)(resume_pc - pc - 4);
pc += 4; /* 4 is the size of the relative offset */
#else
*pc = JMP_ABS_IND64_OPCODE; pc++;
*pc = JMP_ABS_MEM_IND64_MODRM; pc++;
#endif
/* We explicitly map rather than having instr_set_translation() and
* dr_fragment_app_pc() special-case this jump: longer linear search
* in the interception map, but cleaner code.
*/
if (is_in_interception_buffer(pc) && app_pc != NULL) {
ASSERT(xl8_start_pc != NULL);
map_intercept_pc_to_app_pc(xl8_start_pc, app_pc, pc - xl8_start_pc,
pc - xl8_start_pc);
}
#ifdef X64
/* 64-bit abs address is placed after the jmp instr., i.e., rip rel is 0.
* We can't place it before the jmp as in the case of the landing pad
* because there is code in the trampoline immediately preceding this jmp.
*/
*((int *)pc) = 0; pc += 4; /* 4 here is the rel offset to the lpad entry */
*((byte **)pc) = resume_pc; pc += sizeof(resume_pc);
#endif
return pc;
}
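/* For reference, the bytes emitted by emit_resume_jmp() look like this
* (offsets relative to the incoming pc; assuming the usual x86 encodings,
* 0xe9 for jmp rel32 and 0xff 0x25 for a rip-relative indirect jmp):
*
*   32-bit:  e9 <rel32 to resume_pc>            ; 5 bytes total
*   64-bit:  ff 25 00 00 00 00                  ; jmp [rip+0]
*            <8-byte absolute resume_pc>        ; read by the jmp above
*
* i.e., on x64 the absolute target follows the jmp, unlike the landing pad
* where it sits before the jmp (see emit_landing_pad_code()).
*/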
/* Redirects code at tgt_pc to jmp to our_pc, which is filled with generated
* code to call prof_func and then return to the original code.
* Assumes that the original tgt_pc should be unwritable.
* The caller is responsible for adding the generated
* code at our_pc to the dynamo/executable list(s).
*
* We assume we're being called either before any threads are created
* or while all threads are suspended, as our code-overwriting is not atomic!
* The only fix is to switch from code-overwriting to import-table modifying,
* which is more complicated, see Richter chap22 for example: and import-table
* modifying will not allow arbitrary hook placement of course, which we
* support for probes and hot patches.
*
* We guarantee to use a 5-byte jump instruction, even on x64 (PR 250294: we
* sometimes have to allocate nearby landing pads there. See PR 245169 for all
* of the possibilities for x64 hooking, all of which are either very large or
* have caveats; we decided that allocating several 64K chunks and sticking w/
* 5-byte jumps was the cleanest). It is up to the caller to ensure that we
* aren't crossing a cti target point and that displacing these 5 bytes is safe
* (though we will take care of re-relativizing the displaced code).
*
* When cti_safe_to_ignore is true, we expect to restore the code
* immediately after hitting our trampoline, so we can treat the
* first 5 bytes as raw. Otherwise, we may need to PC-relativize or
* deal with conflicting hookers (case 2525). Since a CTI in the
* target is a good sign of a hooker, we may decide to treat that
* specially based on DYNAMO_OPTION(hook_conflict), or we can give up
* and not intercept this call when abort_on_incompatible_hooker is
* true.
* FIXME: if we add one more flag we should switch to a single flag enum
*
* Currently only hotp_only uses app_code_copy_p and alt_exit_tgt_p.
* These point at their respective locations. alt_exit_tgt_p is
* currently NOT aligned for hot patching.
*
* Returns pc after last instruction of emitted interception code,
* or NULL when abort_on_incompatible_hooker is true and tgt_pc starts with a CTI.
*/
static byte *
intercept_call(byte *our_pc, byte *tgt_pc, intercept_function_t prof_func,
void *callee_arg, bool assume_xsp, after_intercept_action_t action_after,
bool abort_on_incompatible_hooker,
bool cti_safe_to_ignore,
byte **app_code_copy_p,
byte **alt_exit_tgt_p)
{
byte *pc, *our_pc_end, *lpad_start, *lpad_pc, *displaced_app_pc;
size_t size;
instrlist_t ilist;
instr_t *instr;
bool changed_prot;
dcontext_t *dcontext = get_thread_private_dcontext();
bool is_hooked = false;
bool ok;
if (dcontext == NULL)
dcontext = GLOBAL_DCONTEXT;
ASSERT(tgt_pc != NULL);
/* can't detect hookers if ignoring CTIs */
ASSERT(!abort_on_incompatible_hooker || !cti_safe_to_ignore);
/* we need 5 bytes for a jump
* find instr boundary >= 5 bytes after pc
*/
LOG(GLOBAL, LOG_ASYNCH, 3, "before intercepting:\n");
instrlist_init(&ilist);
pc = tgt_pc;
do {
app_pc next_pc;
DOLOG(3, LOG_ASYNCH, {
disassemble_with_bytes(dcontext, pc, main_logfile);
});
instr = instr_create(dcontext);
next_pc = decode_cti(dcontext, pc, instr);
ASSERT(instr_valid(instr));
instrlist_append(&ilist, instr);
/* we do not handle control transfer instructions very well here! (case 2525) */
if (instr_opcode_valid(instr) && instr_is_cti(instr)) {
/* allow for only a single cti at first instruction,
*
* unless CTIs are safe to ignore since never actually
* re-relativized (case 4086 == once-only so don't execute copy)
*/
ASSERT(!is_hooked);
ASSERT(tgt_pc == pc || cti_safe_to_ignore);
if (!cti_safe_to_ignore) {
/* we treat this as a sign of a third party hooking before us */
is_hooked = true;
}
}
pc = next_pc;
/* some of our trampolines are best effort anyways: LdrLoadDll
* shouldn't matter much, yet we like to keep it when we can
*/
if (is_hooked && abort_on_incompatible_hooker) {
SYSLOG_INTERNAL_WARNING_ONCE("giving up interception: "PFX" already hooked\n",
tgt_pc);
LOG(GLOBAL, LOG_ASYNCH, 1, "intercept_call: giving up "PFX" already hooked\n", tgt_pc);
instrlist_clear(dcontext, &ilist);
return NULL;
}
if (pc == NULL || (is_hooked && DYNAMO_OPTION(hook_conflict) == HOOKED_TRAMPOLINE_DIE)) {
FATAL_USAGE_ERROR(TAMPERED_NTDLL, 2, get_application_name(), get_application_pid());
}
size = (pc - tgt_pc);
} while (size < 5);
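/* At this point ilist holds the decoded app instrs covering the first size (>= 5)
 * bytes at tgt_pc, i.e., the bytes that our 5-byte jmp will displace.
 */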
pc = our_pc;
if (is_hooked && DYNAMO_OPTION(hook_conflict) == HOOKED_TRAMPOLINE_SQUASH) {
/* squash over original with expected code, so that both
* copies we make later (one for actual execution and one for
* uninterception) have the supposedly original values
* see use in intercept_syscall_wrapper()
*/
/* FIXME: it is not easy to get the correct original bytes
* probably best solution is to read from the original
* ntdll.dll on disk. To avoid having to deal with RVA disk
* to virtual address transformations, it may be even easier
* to call LdrLoadDll with a different path to load a
* pristine copy e.g. \\?C:\WINNT\system32\ntdll.dll
*/
/* FIXME: even if we detach we don't restore the original
* values, since what we have here should be good enough
*/
ASSERT_NOT_IMPLEMENTED(false);
}
/* Store 1st 5 bytes of original code at start of our code
* (won't be executed, original code will jump to after it)
* We do this for convenience of un-intercepting, so we don't have to
* record offset of the copy in the middle of the interception code
* CAUTION: storing the exact copy of the 5 bytes from the app image at
* the start of the trampoline is assumed in hotp_only for
* case 7279 - change only in synch.
*/
memcpy(pc, tgt_pc, 5);
pc += 5;
/* Allocate the landing pad, store its address (4 bytes in 32-bit builds
* and 8 in 64-bit ones) in the trampoline, just after the original app
* code, and emit it.
*/
lpad_start = alloc_landing_pad(tgt_pc);
memcpy(pc, &lpad_start, sizeof(lpad_start));
pc += sizeof(lpad_start);
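/* The trampoline now begins with [5 bytes: copy of original app code]
 * [4- or 8-byte landing pad address], followed by the interception code
 * emitted below.
 */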
if (alt_exit_tgt_p != NULL) {
/* XXX: if we wanted to align for hot-patching we'd do so here
* and we'd pass the (post-padding) pc here as the alternate_after
* to emit_intercept_code
*/
}
lpad_pc = lpad_start;
lpad_pc = emit_landing_pad_code(lpad_pc, pc, tgt_pc + size,
size, &displaced_app_pc, &changed_prot);
pc = emit_intercept_code(dcontext, pc, prof_func, callee_arg,
assume_xsp, assume_xsp, action_after,
(action_after ==
AFTER_INTERCEPT_TAKE_OVER_SINGLE_SHOT) ?
tgt_pc :
((alt_exit_tgt_p != NULL) ?
CURRENTLY_UNKNOWN :
NULL),
alt_exit_tgt_p);
/* If we are TAKE_OVER_SINGLE_SHOT then the handler routine has promised to
* restore the original code and supply the appropriate continuation address.
* As such there is no need for us to copy the code here as we will never use it.
* (Note not copying the code also gives us a quick fix for the Vista image entry
* problem in PR 293452 from not yet handling non-reaching ctis in hook displaced
* code PR 268988). FIXME - not having a displaced copy to decode breaks the
* redirection decode_as_bb() (but not other decode routines) uses to hide the
* hook from the client (see PR 293465 for other reasons we need a better solution
* to that problem). */
if (action_after != AFTER_INTERCEPT_TAKE_OVER_SINGLE_SHOT) {
/* Map displaced code to original app PCs */
map_intercept_pc_to_app_pc
(displaced_app_pc, tgt_pc, size + JMP_LONG_LENGTH /* include jmp back */,
size);
/* Copy original instructions to our version, re-relativizing where necessary */
if (app_code_copy_p != NULL)
*app_code_copy_p = displaced_app_pc;
copy_app_code(dcontext, tgt_pc, displaced_app_pc, size, &ilist);
} else {
/* single shot hooks shouldn't need a copy of the app code */
ASSERT(app_code_copy_p == NULL);
}
finalize_landing_pad_code(lpad_start, changed_prot);
/* free the instrlist_t elements */
instrlist_clear(dcontext, &ilist);
if (is_hooked) {
if (DYNAMO_OPTION(hook_conflict) == HOOKED_TRAMPOLINE_CHAIN) {
/* we only have to rerelativize rel32, yet indirect
* branches can also be used by hookers, in which case we
* don't need to do anything special when copying as bytes
*/
/* FIXME: now re-relativize at target location */
ASSERT_NOT_IMPLEMENTED(false);
ASSERT_NOT_TESTED();
}
}
/* Must return to the displaced app code in the landing pad */
pc = emit_resume_jmp(pc, displaced_app_pc, tgt_pc, pc);
our_pc_end = pc;
/* Replace original code with jmp to our version (after 5-byte backup) */
/* copy-on-write will give us a copy of this page */
ok = make_hookable(tgt_pc, JMP_REL32_SIZE, &changed_prot);
if (!ok) {
/* FIXME: we fail to insert our hook but for now it is easier
* to pretend that we succeeded. */
/* should really return NULL and have callers handle this better */
return our_pc_end;
}
pc = tgt_pc;
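/* The hook itself is a plain 5-byte rel32 jmp: the JMP_REL32_OPCODE byte
 * followed by a 32-bit displacement measured from the end of the instruction,
 * so the value stored below is lpad_pc - (tgt_pc + 5), i.e., lpad_pc - pc - 4
 * once pc has been advanced past the opcode byte.
 */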
*pc = JMP_REL32_OPCODE; pc++;
IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_int(lpad_pc - pc - 4)));
*((int *)pc) = (int)(ptr_int_t)(lpad_pc - pc - 4);
/* make our page unwritable now */
make_unhookable(tgt_pc, JMP_REL32_SIZE, changed_prot);
ASSERT(our_pc_end != NULL);
return our_pc_end;
}
/* Assumes that tgt_pc should be unwritable. Handles hooks with or without
* a landing pad. our_pc is the displaced app code to copy to tgt_pc.
*/
static void
un_intercept_call(byte *our_pc, byte *tgt_pc)
{
bool changed_prot;
bool ok;
byte *lpad_entry;
/* if intercept_call() has failed we shouldn't be un-intercepting */
if (our_pc == NULL)
return;
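/* Recover the landing pad entry from the rel32 jmp we wrote at tgt_pc: the
 * displacement stored at tgt_pc+1 is relative to the end of the 5-byte jmp,
 * so the entry is (tgt_pc + JMP_REL32_SIZE) + disp.
 */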
lpad_entry = (tgt_pc + JMP_REL32_SIZE) + *((int *)(tgt_pc + 1));
/* restore 1st 5 bytes of original code */
ok = make_hookable(tgt_pc, JMP_REL32_SIZE, &changed_prot);
/* if we were able to hook we can't fail on unhook */
ASSERT(ok || memcmp(tgt_pc, our_pc, JMP_REL32_SIZE) == 0 /* hook wasn't applied */);
if (!ok) {
return;
}
ASSERT(memcmp(tgt_pc, our_pc, JMP_REL32_SIZE) != 0 /* hook was applied */);
memcpy(tgt_pc, our_pc, JMP_REL32_SIZE);
make_unhookable(tgt_pc, JMP_REL32_SIZE, changed_prot);
/* Redirect the first jump in the landing pad to the hooked address (which we just
* restored above) - in case someone has chained with our hook.
*/
ok = make_hookable(lpad_entry, JMP_SIZE, &changed_prot);
ASSERT(ok);
if (ok) {
/* patch jmp to go back to target */
/* Note - not a hot_patch, caller must have synchronized already to make the
* memcpy restore above safe. */
/* FIXME: this looks wrong for x64 which uses abs jmp */
insert_relative_target(lpad_entry+1, tgt_pc, false /* not a hotpatch */);
make_unhookable(lpad_entry, JMP_SIZE, changed_prot);
}
DOLOG(3, LOG_ASYNCH, {
byte *pc = tgt_pc;
LOG(GLOBAL, LOG_ASYNCH, 3, "after un-intercepting:\n");
do {
/* Use GLOBAL_DCONTEXT here since we may have already
* called dynamo_thread_exit()
*/
pc = disassemble_with_bytes(GLOBAL_DCONTEXT, pc, main_logfile);
} while (pc < tgt_pc + JMP_REL32_SIZE);
});
unmap_intercept_pc((app_pc)tgt_pc);
}
/* Returns the syscall wrapper at nt_wrapper to a pristine (unhooked) state. Currently
* used for -clean_testalert to block the problematic injection of SpywareDoctor (9288)
* and similar apps. Returns true if syscall wrapper required cleaning */
/* FIXME - use this for our hook conflict squash policy in intercept_syscall_wrapper as
* this can handle more complicated hooks. */
static bool
clean_syscall_wrapper(byte *nt_wrapper, int sys_enum)
{
dcontext_t *dcontext = GLOBAL_DCONTEXT;
instr_t *instr_new, *instr_old = instr_create(dcontext);
instrlist_t *ilist = instrlist_create(dcontext);
app_pc pc = nt_wrapper;
bool hooked = false;
int sysnum = syscalls[sys_enum];
uint arg_bytes = syscall_argsz[sys_enum];
if (nt_wrapper == NULL || sysnum == SYSCALL_NOT_PRESENT)
goto exit_clean_syscall_wrapper;
/* syscall wrapper should look like
* For NT/2000
* mov eax, sysnum {5 bytes}
* lea edx, [esp+4] {4 bytes}
* int 2e {2 bytes}
* ret arg_bytes {1 byte (0 args) or 3 bytes}
*
* For XPsp0/XPsp1/2003sp0
* mov eax, sysnum {5 bytes}
* mov edx, VSYSCALL_ADDR {5 bytes}
* call edx {2 bytes}
* ret arg_bytes {1 byte (0 args) or 3 bytes}
*
* For XPsp2/2003sp1/Vista
* mov eax, sysnum {5 bytes}
* mov edx, VSYSCALL_ADDR {5 bytes}
* call [edx] {2 bytes}
* ret arg_bytes {1 byte (0 args) or 3 bytes}
*
* For WOW64 (case 3922), there are two types: if setting ecx to 0, xor is used.
* mov eax, sysnum {5 bytes}
* mov ecx, wow_index {5 bytes} --OR-- xor ecx,ecx {2 bytes}
* lea edx, [esp+4] {4 bytes}
* call fs:0xc0 {7 bytes}
* On Win7 WOW64 after the call we have an add:
* add esp,0x4 {3 bytes}
* ret arg_bytes {1 byte (0 args) or 3 bytes}
* On Win8 WOW64 we have no ecx (and no post-syscall add):
* 777311bc b844000100 mov eax,10044h
* 777311c1 64ff15c0000000 call dword ptr fs:[0C0h]
* 777311c8 c3 ret
*
* For win8 sysenter we have a co-located "inlined" callee:
* 77d7422c b801000000 mov eax,1
* 77d74231 e801000000 call ntdll!NtYieldExecution+0xb (77d74237)
* 77d74236 c3 ret
* 77d74237 8bd4 mov edx,esp
* 77d74239 0f34 sysenter
* 77d7423b c3 ret
* But we instead do the equivalent call to KiFastSystemCall.
*
* x64 syscall (PR 215398):
* mov r10, rcx {3 bytes}
* mov eax, sysnum {5 bytes}
* syscall {2 bytes}
* ret {1 byte}
*/
/* build correct instr list */
#define APP(list, inst) instrlist_append((list), (inst))
#ifdef X64
APP(ilist, INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_R10),
opnd_create_reg(REG_RCX)));
APP(ilist, INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_EAX),
OPND_CREATE_INT32(sysnum)));
APP(ilist, INSTR_CREATE_syscall(dcontext));
APP(ilist, INSTR_CREATE_ret(dcontext));
#else
APP(ilist, INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_EAX),
opnd_create_immed_int(sysnum, OPSZ_4)));
/* NOTE - the structure of the wrapper depends only on the OS version, not on the
* syscall method (for ex. using int on XPsp2 just changes the target on the
* vsyscall page, not the wrapper layout). */
if (get_os_version() <= WINDOWS_VERSION_2000) {
APP(ilist,
INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XDX),
opnd_create_base_disp(REG_XSP, REG_NULL, 0, 4, OPSZ_0)));
APP(ilist, INSTR_CREATE_int(dcontext, opnd_create_immed_int(0x2e, OPSZ_1)));
} else if (is_wow64_process(NT_CURRENT_PROCESS)) {
ASSERT(get_syscall_method() == SYSCALL_METHOD_WOW64);
if (syscall_uses_wow64_index()) {
ASSERT(wow64_index != NULL);
ASSERT(wow64_index[sys_enum] != SYSCALL_NOT_PRESENT);
if (wow64_index[sys_enum] == 0) {
APP(ilist, INSTR_CREATE_xor(dcontext, opnd_create_reg(REG_XCX),
opnd_create_reg(REG_XCX)));
} else {
APP(ilist, INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_XCX),
OPND_CREATE_INT32(wow64_index[sys_enum])));
}
APP(ilist,
INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XDX),
opnd_create_base_disp(REG_XSP, REG_NULL, 0, 4, OPSZ_0)));
}
APP(ilist, create_syscall_instr(dcontext));
} else { /* XP or greater */
if (get_os_version() >= WINDOWS_VERSION_8) {
/* Win8 does not use ind calls: it calls to a local copy of KiFastSystemCall.
* We do the next best thing.
*/
ASSERT(KiFastSystemCall != NULL);
APP(ilist, INSTR_CREATE_call(dcontext, opnd_create_pc(KiFastSystemCall)));
} else {
APP(ilist,
INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_XDX),
OPND_CREATE_INTPTR((ptr_int_t)
VSYSCALL_BOOTSTRAP_ADDR)));
if (use_ki_syscall_routines()) {
/* call through vsyscall addr to Ki*SystemCall routine */
APP(ilist,
INSTR_CREATE_call_ind(dcontext, opnd_create_base_disp
(REG_XDX, REG_NULL, 0, 0, OPSZ_4_short2)));
} else {
/* call to vsyscall addr */
APP(ilist, INSTR_CREATE_call_ind(dcontext, opnd_create_reg(REG_XDX)));
}
}
}
if (is_wow64_process(NT_CURRENT_PROCESS) && get_os_version() == WINDOWS_VERSION_7) {
APP(ilist,
INSTR_CREATE_add(dcontext, opnd_create_reg(REG_XSP), OPND_CREATE_INT8(4)));
}
if (arg_bytes == 0) {
APP(ilist, INSTR_CREATE_ret(dcontext));
} else {
APP(ilist,
INSTR_CREATE_ret_imm(dcontext, opnd_create_immed_int(arg_bytes, OPSZ_1)));
}
#endif /* X64 */
#undef APP
/* we've seen 3 different ways of hooking syscall wrappers :
* 1) jmp overwriting first 5 bytes (mov eax, sysnum), most common.
* 2) jmp overwriting second 5 bytes (certain versions of Sygate)
* 3) overwriting first 8 bytes with push eax (x3) then jmp (Spyware Doctor 9288, A^2
* anti-spyware 10414). */
/* NOTE - we could finish the walk whether hooked or not, but not much point and
* I don't fully trust our decode routine w/ junk input (if for ex. hook doesn't end
* on an instr boundary). */
for (instr_new = instrlist_first(ilist); instr_new != NULL;
instr_new = instr_get_next(instr_new)) {
instr_reset(dcontext, instr_old);
pc = decode(dcontext, pc, instr_old);
if (!instr_same(instr_new, instr_old) &&
/* don't consider call to KiFastSystemCall vs inlined sysenter to be a hook */
!(get_os_version() >= WINDOWS_VERSION_8 &&
instr_get_opcode(instr_new) == instr_get_opcode(instr_old) &&
instr_get_opcode(instr_new) == OP_call)) {
/* We haven't seen hookers where the opcode would match, so in that case
 * it seems likely it could be our fault (got an immed wrong or something). */
ASSERT_CURIOSITY(instr_get_opcode(instr_new) != instr_get_opcode(instr_old));
/* we haven't seen any hook start deeper than the 2nd instruction */
ASSERT_CURIOSITY(instr_new == instrlist_first(ilist) ||
instr_new == instr_get_next(instrlist_first(ilist)));
hooked = true;
break;
}
}
LOG(GLOBAL, LOG_SYSCALLS, hooked ? 1U : 2U,
"Syscall wrapper @ "PFX" syscall_num=0x%03x%s hooked.\n",
nt_wrapper, sysnum, hooked ? "" : " not");
if (hooked) {
bool changed_prot;
int length = 0, encode_length;
byte *nxt_pc;
instr_t *in;
SYSLOG_INTERNAL_WARNING_ONCE("Cleaning hooked Nt wrapper @"PFX" sysnum=0x%03x",
nt_wrapper, sysnum);
for (in = instrlist_first(ilist); in != NULL; in = instr_get_next(in))
length += instr_length(dcontext, in);
DOLOG(1, LOG_SYSCALLS, {
LOG(GLOBAL, LOG_SYSCALLS, 1, "Replacing hooked wrapper :\n");
pc = nt_wrapper;
/* Note - we may disassemble junk here (if hook doesn't end on instr
* boundary) but our decode routines should handle it; this is debug-only anyway. */
while (pc - nt_wrapper < length)
pc = disassemble_with_bytes(dcontext, pc, GLOBAL);
LOG(GLOBAL, LOG_SYSCALLS, 1, "With :\n");
instrlist_disassemble(dcontext, nt_wrapper, ilist, GLOBAL);
});
make_hookable(nt_wrapper, length, &changed_prot);
nxt_pc = instrlist_encode(dcontext, ilist, nt_wrapper,
false /* no jmp targets */);
ASSERT(nxt_pc != NULL);
encode_length = (int) (nxt_pc - nt_wrapper);
ASSERT(encode_length == length && "clean syscall encoded length mismatch");
make_unhookable(nt_wrapper, length, changed_prot);
DOLOG(1, LOG_SYSCALLS, {
LOG(GLOBAL, LOG_SYSCALLS, 1, "Cleaned wrapper is now :\n");
pc = nt_wrapper;
while (pc - nt_wrapper < length)
pc = disassemble_with_bytes(dcontext, pc, GLOBAL);
});
}
exit_clean_syscall_wrapper:
instr_destroy(dcontext, instr_old);
instrlist_clear_and_destroy(dcontext, ilist);
return hooked;
}
/* Inserts a trampoline in a system call wrapper.
* All uses should end up using dstack -- else watch out for initstack
* infinite loop (see comment above).
* Returns in skip_syscall_pc the native pc for skipping the system call altogether.
*
* Since the only safe point is the first instr, and not right at the syscall
* instr itself (no 5-byte spot there), we have to copy the whole series of app
* instrs up until the syscall instr into our buffer to be executed prior to the
* callee. This means any intercepted syscall from the cache will have that
* sequence run NATIVELY! A solution is to set a flag to go back to native
* after the next syscall, and take over right away, but that is a little more
* worrisome than only executing the syscall under DR in terms of the potential
* to miss the re-native trigger.
*
* For x64, we still use a 5-byte jump, assuming our main heap is within 2GB of
* ntdll.dll (xref PR 215395); if not we'll need an auxiliary landing pad
* trampoline within 2GB (xref PR 250294 where we need to support such
* trampolines for general hooks). Also xref PR 245169 on x64 hooking
* possibilities, none of which is ideal.
*
* FIXME: other interception ideas: could do at instr after mov-immed,
* and arrange own int 2e for win2k, and emulate rest of sequence when
* handling syscall from handler -- this would eliminate some issues
* with the pre-syscall sequence copy, but not clear if better overall.
* Would be nice to have a single shared syscall handler, but since
* wrappers are stdcall that would be difficult.
*
* We allow the callee to execute the syscall itself, and by returning
* AFTER_INTERCEPT_LET_GO_ALT_DYN, it signals to skip the actual syscall,
* so we have control returned to the instr after the syscall instr.
* For AFTER_INTERCEPT_LET_GO or AFTER_INTERCEPT_TAKE_OVER, the syscall
* instr itself is the next instr to be executed.
*
* N.B.: this routine makes assumptions about the exact sequence of instrs in
* syscall wrappers, in particular that the indirect call to the vsyscall page
* can be turned into a direct call, which is only safe for XP SP2 if the
* vsyscall page is not writable, and cannot be made writable, which is what we
* have observed to be true.
*/
/* Helper function that returns the after-hook pc */
static byte *
syscall_wrapper_ilist(dcontext_t *dcontext,
instrlist_t *ilist, /* IN/OUT */
byte **ptgt_pc /* IN/OUT */,
void *callee_arg,
byte *fpo_stack_adjustment, /* OUT OPTIONAL */
byte **ret_pc /* OUT */,
const char *name)
{
byte *pc, *after_hook_target = NULL;
byte *after_mov_immed;
instr_t *instr, *hook_return_instr = NULL;
int opcode = OP_UNDECODED;
int sys_enum = (int)(ptr_uint_t)callee_arg;
int native_sys_num = syscalls[sys_enum];
pc = *ptgt_pc;
/* we need 5 bytes for a jump, and we assume that the first instr
* (2nd instr for x64, where we skip the 1st) is a 5-byte mov immed!
*/
instr = instr_create(dcontext);
pc = decode(dcontext, pc, instr);
after_mov_immed = pc;
/* FIXME: handle other hookers gracefully by chaining!
* Note that moving trampoline point 5 bytes in could help here (see above).
*/
#ifndef X64
ASSERT(instr_length(dcontext, instr) >= 5);
#endif
if (fpo_stack_adjustment != NULL)
*fpo_stack_adjustment = 0; /* for GBOP case 7127 */
if (instr_is_cti(instr)) {
/* we only have to rerelativize rel32, yet indirect
* branches can also be used by hookers, in which case we
* don't need to do anything special when copying as bytes
* FIXME: should we still die?
*/
/* see case 2525 for background discussion */
if (DYNAMO_OPTION(native_exec_hook_conflict) == HOOKED_TRAMPOLINE_DIE) {
/* FIXME: we could still print the message but we don't have to kill the app here */
FATAL_USAGE_ERROR(TAMPERED_NTDLL, 2, get_application_name(),
get_application_pid());
} else if (DYNAMO_OPTION(native_exec_hook_conflict) == HOOKED_TRAMPOLINE_CHAIN) {
/* we assume 5-byte hookers as well - so only need to relativize in our own copy */
/* and we need to introduce a PUSH in case of a CALL here */
ASSERT(instr_get_opcode(instr) != OP_call_ind);
if (instr_is_mbr(instr)) {
/* one can imagine mbr being used on x64 */
FATAL_USAGE_ERROR(TAMPERED_NTDLL, 2, get_application_name(),
get_application_pid());
}
if (instr_get_opcode(instr) == OP_call) {
LOG(GLOBAL, LOG_ASYNCH, 2,
"intercept_syscall_wrapper: mangling hooked call at "PFX"\n", pc);
/* replace the call w/ a push/jmp hoping this will
* eventually return to us unless the hooker decides
* to squash the system call or execute without going
* back here.
* FIXME: keep in mind the code on the instrlist is executed natively
*/
insert_push_immed_ptrsz(dcontext, (ptr_int_t)pc, ilist, NULL,
NULL, NULL);
#ifdef X64
/* check reachability from new location */
/* allow interception code to be up to a page: don't bother
* to calculate exactly where our jmp will be encoded */
if (!REL32_REACHABLE(interception_cur_pc,
opnd_get_pc(instr_get_target(instr))) ||
!REL32_REACHABLE(interception_cur_pc + PAGE_SIZE,
opnd_get_pc(instr_get_target(instr)))) {
FATAL_USAGE_ERROR(TAMPERED_NTDLL, 2, get_application_name(),
get_application_pid());
}
#endif
instrlist_append(ilist,
INSTR_CREATE_jmp(dcontext,
opnd_create_pc(opnd_get_pc(instr_get_target(instr)))));
/* skip original instruction */
instr_destroy(dcontext, instr);
/* interp still needs to be updated */
ASSERT_NOT_IMPLEMENTED(false);
} else if (instr_get_opcode(instr) == OP_jmp) {
/* FIXME - no good way to regain control after the hook */
ASSERT_NOT_IMPLEMENTED(false);
LOG(GLOBAL, LOG_ASYNCH, 2, "intercept_syscall_wrapper: hooked with jmp "PFX"\n", pc);
/* just append instruction as is */
instrlist_append(ilist, instr);
} else {
ASSERT_NOT_IMPLEMENTED(false && "unchainable CTI");
/* FIXME PR 215397: need to re-relativize pc-relative memory reference */
IF_X64(ASSERT_NOT_IMPLEMENTED(!instr_has_rel_addr_reference(instr)));
/* just append instruction as is; emitting will re-relativize it if necessary */
instrlist_append(ilist, instr);
/* FIXME: if instr's length doesn't match normal 1st instr we'll
* get off down below: really shouldn't continue here */
}
} else if (DYNAMO_OPTION(native_exec_hook_conflict) == HOOKED_TRAMPOLINE_SQUASH) {
SYSLOG_INTERNAL_WARNING("intercept_syscall_wrapper: "
"squashing hook in %s @"PFX, name, pc);
LOG(GLOBAL, LOG_ASYNCH, 2,
"intercept_syscall_wrapper: squashing hooked syscall %s %02x at "PFX"\n",
name, native_sys_num, pc);
#ifdef X64
/* in this case we put our hook at the 1st instr */
instrlist_append(ilist,
INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_R10),
opnd_create_reg(REG_RCX)));
#endif
/* we normally ASSERT that 1st instr is always mov imm -> eax */
instrlist_append(ilist,
INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_EAX),
OPND_CREATE_INT32(native_sys_num)));
/* FIXME: even if we detach we don't restore the original
* values, since what we have here should be good enough
*/
/* skip original instruction */
instr_destroy(dcontext, instr);
} else if (DYNAMO_OPTION(native_exec_hook_conflict) == HOOKED_TRAMPOLINE_HOOK_DEEPER) {
/* move our hook one instruction deeper assuming hooker will
* return to right after the hook, verify that's an
* instruction boundary */
#ifdef X64
/* not much room for two hooks before the syscall; we don't support
* for now */
ASSERT_NOT_REACHED();
FATAL_USAGE_ERROR(TAMPERED_NTDLL, 2, get_application_name(),
get_application_pid());
#else
ASSERT(instr_length(dcontext, instr) == 5 /* length of normal mov_imm */);
*ptgt_pc = pc;
/* skip original instruction */
instr_destroy(dcontext, instr);
#endif
} else if (DYNAMO_OPTION(native_exec_hook_conflict) ==
HOOKED_TRAMPOLINE_NO_HOOK) {
SYSLOG_INTERNAL_WARNING("intercept_syscall_wrapper: "
"not hooking %s due to conflict @"PFX, name, pc);
LOG(GLOBAL, LOG_ASYNCH, 2,
"intercept_syscall_wrapper: not hooking syscall %s %02x at "PFX"\n",
name, native_sys_num, pc);
instr_destroy(dcontext, instr);
return NULL;
} else {
ASSERT_NOT_REACHED();
FATAL_USAGE_ERROR(TAMPERED_NTDLL, 2, get_application_name(),
get_application_pid());
}
} else {
#ifdef X64
/* first instr is mov rcx -> r10, which we skip to reach the 5-byte mov immed */
ASSERT(instr_get_opcode(instr) == OP_mov_ld &&
opnd_is_reg(instr_get_src(instr, 0)) &&
opnd_get_reg(instr_get_src(instr, 0)) == REG_RCX &&
opnd_is_reg(instr_get_dst(instr, 0)) &&
opnd_get_reg(instr_get_dst(instr, 0)) == REG_R10);
/* we hook after the 1st instr. will this confuse other hookers who
* will think there currently is no hook b/c not on 1st instr? */
*ptgt_pc = pc;
instr_destroy(dcontext, instr);
/* now decode the 2nd instr which should be a mov immed */
DOLOG(3, LOG_ASYNCH, { disassemble_with_bytes(dcontext, pc, main_logfile); });
instr = instr_create(dcontext);
pc = decode(dcontext, pc, instr);
ASSERT(instr_length(dcontext, instr) == 5 /* length of normal mov_imm */);
opcode = instr_get_opcode(instr);
/* now fall through */
#endif
/* normally a mov eax, native_sys_num */
ASSERT(instr_get_opcode(instr) == OP_mov_imm);
ASSERT(opnd_get_immed_int(instr_get_src(instr, 0)) == native_sys_num);
LOG(GLOBAL, LOG_ASYNCH, 3, "intercept_syscall_wrapper: hooked syscall %02x at "PFX"\n",
native_sys_num, pc);
/* append instruction (non-CTI) */
instrlist_append(ilist, instr);
}
#ifdef X64
/* 3rd instr: syscall */
instr = instr_create(dcontext);
after_hook_target = pc;
pc = decode(dcontext, pc, instr);
*ret_pc = pc;
ASSERT(instr_get_opcode(instr) == OP_syscall);
instr_destroy(dcontext, instr);
#else
if (get_syscall_method() == SYSCALL_METHOD_WOW64 &&
get_os_version() >= WINDOWS_VERSION_8) {
ASSERT(!syscall_uses_wow64_index());
/* second instr is a call*, what we consider the system call instr */
after_hook_target = pc;
instr = instr_create(dcontext);
*ret_pc = decode(dcontext, pc, instr); /* skip call* to skip syscall */
ASSERT(instr_get_opcode(instr) == OP_call_ind);
instr_destroy(dcontext, instr);
/* XXX: how handle chrome hooks on win8? (xref i#464) */
} else if (get_syscall_method() == SYSCALL_METHOD_SYSENTER &&
get_os_version() >= WINDOWS_VERSION_8) {
/* Second instr is a call to an inlined routine that calls sysenter.
* We treat this in a similar way to call* to sysenter which is handled
* down below.
* XXX: could share a little bit of code but not much.
*/
after_hook_target = pc;
instr = instr_create(dcontext);
*ret_pc = decode(dcontext, pc, instr); /* skip call to skip syscall */
ASSERT(instr_get_opcode(instr) == OP_call);
/* replace the call w/ a push */
instrlist_append(ilist, INSTR_CREATE_push_imm
(dcontext, OPND_CREATE_INTPTR((ptr_int_t)*ret_pc)));
/* the callee, inlined later in wrapper, or KiFastSystemCall */
pc = (byte *) opnd_get_pc(instr_get_target(instr));
/* fourth instr: mov %xsp -> %xdx */
instr_reset(dcontext, instr); /* re-use call container */
pc = decode(dcontext, pc, instr);
instrlist_append(ilist, instr);
ASSERT(instr_get_opcode(instr) == OP_mov_ld);
/* fifth instr: sysenter */
instr = instr_create(dcontext);
after_hook_target = pc;
pc = decode(dcontext, pc, instr);
ASSERT(instr_get_opcode(instr) == OP_sysenter);
instr_destroy(dcontext, instr);
/* ignore ret after sysenter, we'll return to ret after call */
} else {
/* second instr is either a lea, a mov immed, or an xor */
DOLOG(3, LOG_ASYNCH, { disassemble_with_bytes(dcontext, pc, main_logfile); });
instr = instr_create(dcontext);
pc = decode(dcontext, pc, instr);
instrlist_append(ilist, instr);
opcode = instr_get_opcode(instr);
}
if (after_hook_target != NULL) {
/* all set */
} else if (get_syscall_method() == SYSCALL_METHOD_WOW64) {
ASSERT(opcode == OP_xor || opcode == OP_mov_imm);
/* third instr is a lea */
instr = instr_create(dcontext);
pc = decode(dcontext, pc, instr);
if (instr_get_opcode(instr) == OP_jmp_ind) {
/* Handle chrome hooks (i#464) via targeted handling since these
* don't look like any other hooks we've seen. We can generalize if
* we later find similar-looking hooks elsewhere.
* They look like this:
* ntdll!NtMapViewOfSection:
* 77aafbe0 b825000000 mov eax,0x25
* 77aafbe5 ba28030a00 mov edx,0xa0328
* 77aafbea ffe2 jmp edx
* 77aafbec c215c0 ret 0xc015
* 77aafbef 90 nop
* 77aafbf0 0000 add [eax],al
* 77aafbf2 83c404 add esp,0x4
* 77aafbf5 c22800 ret 0x28
* We put in the native instrs in our hook so our stuff
* operates correctly, and assume the native state change
* won't affect the chrome hook code. We resume
* right after the 1st mov-imm-eax instr. These are the native
* instrs for all chrome hooks in ntdll (Nt{,Un}MapViewOfSection),
* which are put in place from the parent, so they're there when we
* initialize and aren't affected by -handle_ntdll_modify:
* 77aafbe5 33c9 xor ecx,ecx
* 77aafbe7 8d542404 lea edx,[esp+0x4]
*/
instr_t *tmp = instrlist_last(ilist);
instrlist_remove(ilist, tmp);
instr_destroy(dcontext, tmp);
instr_destroy(dcontext, instr);
ASSERT(syscall_uses_wow64_index()); /* else handled above */
ASSERT(wow64_index != NULL);
if (wow64_index[sys_enum] == 0) {
instrlist_append
(ilist, INSTR_CREATE_xor
(dcontext, opnd_create_reg(REG_XCX), opnd_create_reg(REG_XCX)));
} else {
instrlist_append
(ilist, INSTR_CREATE_mov_imm
(dcontext, opnd_create_reg(REG_XCX),
OPND_CREATE_INT32(wow64_index[sys_enum])));
}
instrlist_append
(ilist, INSTR_CREATE_lea
(dcontext, opnd_create_reg(REG_XDX),
opnd_create_base_disp(REG_XSP, REG_NULL, 0, 0x4, OPSZ_lea)));
after_hook_target = after_mov_immed;
/* skip chrome hook to skip syscall: target "add esp,0x4" */
# define CHROME_HOOK_DISTANCE_JMP_TO_SKIP 6
*ret_pc = pc + CHROME_HOOK_DISTANCE_JMP_TO_SKIP;
DOCHECK(1, {
instr = instr_create(dcontext);
decode(dcontext, *ret_pc, instr);
ASSERT(instr_get_opcode(instr) == OP_add);
instr_destroy(dcontext, instr);
});
} else {
ASSERT(instr_get_opcode(instr) == OP_lea);
instrlist_append(ilist, instr);
/* fourth instr is a call*, what we consider the system call instr */
after_hook_target = pc;
instr = instr_create(dcontext);
*ret_pc = decode(dcontext, pc, instr); /* skip call* to skip syscall */
ASSERT(instr_get_opcode(instr) == OP_call_ind);
instr_destroy(dcontext, instr);
}
} else if (opcode == OP_mov_imm) {
ptr_int_t immed = opnd_get_immed_int(instr_get_src(instr, 0));
ASSERT(PAGE_START(immed) == (ptr_uint_t) VSYSCALL_PAGE_START_BOOTSTRAP_VALUE);
ASSERT(get_syscall_method() == SYSCALL_METHOD_SYSENTER);
ASSERT(get_os_version() >= WINDOWS_VERSION_XP);
/* third instr is an indirect call */
instr = instr_create(dcontext);
pc = decode(dcontext, pc, instr);
*ret_pc = pc;
ASSERT(instr_get_opcode(instr) == OP_call_ind);
if (fpo_stack_adjustment != NULL) {
/* for GBOP case 7127 */
*fpo_stack_adjustment = 4;
}
/* replace the call w/ a push */
instrlist_append(ilist, INSTR_CREATE_push_imm
(dcontext, OPND_CREATE_INTPTR((ptr_int_t)pc)));
/* the callee, either on vsyscall page or at KiFastSystemCall */
if (opnd_is_reg(instr_get_src(instr, 0)))
pc = (byte *) immed;
else /* KiFastSystemCall */
pc = *((byte **)immed);
/* fourth instr: mov %xsp -> %xdx */
instr_reset(dcontext, instr); /* re-use ind call container */
pc = decode(dcontext, pc, instr);
instrlist_append(ilist, instr);
ASSERT(instr_get_opcode(instr) == OP_mov_ld);
/* fifth instr: sysenter */
instr = instr_create(dcontext);
after_hook_target = pc;
pc = decode(dcontext, pc, instr);
ASSERT(instr_get_opcode(instr) == OP_sysenter);
instr_destroy(dcontext, instr);
/* ignore ret after sysenter, we'll return to ret after call */
} else {
ASSERT(opcode == OP_lea);
/* third instr: int 2e */
instr = instr_create(dcontext);
*ret_pc = decode(dcontext, pc, instr);
ASSERT(instr_get_opcode(instr) == OP_int);
/* if we hooked deeper, will need to hook over the int too */
if (pc - *ptgt_pc < 5 /* length of our hook */) {
/* Need to add an int 2e to the return path since hook clobbered
* the original one. We use create_syscall_instr(dcontext) for
* the sygate int fix. FIXME - the pc will now show up as
* after_do/share_syscall() but should be ok since anyone
* checking for those on this thread should have already checked
* for it being native. */
hook_return_instr = create_syscall_instr(dcontext);
after_hook_target = *ret_pc;
ASSERT(DYNAMO_OPTION(native_exec_hook_conflict) ==
HOOKED_TRAMPOLINE_HOOK_DEEPER);
} else {
/* point after_hook_target to int 2e */
after_hook_target = pc;
}
instr_destroy(dcontext, instr);
}
#endif
return after_hook_target;
}
byte *
intercept_syscall_wrapper(byte **ptgt_pc /* IN/OUT */,
intercept_function_t prof_func,
void *callee_arg, after_intercept_action_t action_after,
app_pc *skip_syscall_pc /* OUT */,
byte **orig_bytes_pc /* OUT */,
byte *fpo_stack_adjustment /* OUT OPTIONAL */,
const char *name)
{
byte *pc, *emit_pc, *ret_pc = NULL, *after_hook_target = NULL, *tgt_pc;
byte *lpad_start, *lpad_pc, *lpad_resume_pc, *xl8_start_pc;
instr_t *instr, *hook_return_instr = NULL;
instrlist_t ilist;
bool changed_prot;
dcontext_t *dcontext = get_thread_private_dcontext();
bool ok;
if (dcontext == NULL)
dcontext = GLOBAL_DCONTEXT;
instrlist_init(&ilist);
ASSERT(ptgt_pc != NULL && *ptgt_pc != NULL);
after_hook_target = syscall_wrapper_ilist(dcontext, &ilist, ptgt_pc, callee_arg,
fpo_stack_adjustment, &ret_pc, name);
if (after_hook_target == NULL)
return NULL; /* aborted */
tgt_pc = *ptgt_pc;
pc = tgt_pc;
LOG(GLOBAL, LOG_ASYNCH, 3, "%s: before intercepting:\n", __FUNCTION__);
DOLOG(3, LOG_ASYNCH, { disassemble_with_bytes(dcontext, pc, main_logfile); });
pc = interception_cur_pc; /* current spot in interception buffer */
/* copy original 5 bytes to ease unhooking, we won't execute this */
*orig_bytes_pc = pc;
memcpy(pc, tgt_pc, 5);
pc += 5;
/* i#901: We need a landing pad b/c ntdll may not be reachable from DR.
* However, we do not support rip-rel instrs in the syscall wrapper, as by
* keeping the displaced app code in the intercept buffer and not in the
* landing pad we can use the standard landing pad layout, the existing
* emit_landing_pad_code(), the existing is_syscall_trampoline(), and other
* routines, and also keeps the landing pads themselves a constant size and
* layout (though the ones here do not have all their space used b/c there's
* no displaced app code).
*/
lpad_start = alloc_landing_pad(tgt_pc);
lpad_pc = lpad_start;
lpad_pc = emit_landing_pad_code(lpad_pc, pc, after_hook_target,
0/*no displaced code in lpad*/,
&lpad_resume_pc, &changed_prot);
/* i#1027: map jmp back in landing pad to original app pc. We do this to
* have the translation just in case, even though we hide this jmp from the
* client. Xref the PR 219351 comment in is_intercepted_app_pc().
*/
map_intercept_pc_to_app_pc(lpad_resume_pc, after_hook_target, JMP_LONG_LENGTH, 0);
finalize_landing_pad_code(lpad_start, changed_prot);
emit_pc = pc;
/* we assume that interception buffer is still writable */
/* we need to enter at copy of pre-syscall sequence, since we need
* callee to be at app state exactly prior to syscall instr itself.
* this means this sequence is executed natively even for syscalls
* in the cache (since interception code is run natively) -- only
* worry would be stack faults, whose context we might xlate incorrectly
*
* N.B.: bb_process_ubr() assumes that the target of the trampoline
* is the original mov immed!
*/
/* insert our copy of app instrs leading up to syscall
* first instr doubles as the clobbered original code for un-intercepting.
*/
for (instr = instrlist_first(&ilist); instr != NULL; instr = instr_get_next(instr)) {
pc = instr_encode(dcontext, instr, pc);
ASSERT(pc != NULL);
}
instrlist_clear(dcontext, &ilist);
pc = emit_intercept_code(dcontext, pc, prof_func, callee_arg,
false /*do not assume xsp*/,
false /*not known to not be on dstack: ok to clobber flags*/,
action_after,
ret_pc /* alternate target to skip syscall */, NULL);
/* Map interception buffer PCs to original app PCs */
if (is_in_interception_buffer(pc))
map_intercept_pc_to_app_pc(pc, tgt_pc, 10 /* 5 bytes + jmp back */, 5);
/* The normal target, for really doing the system call native, used
* for letting go normally and for take over.
* We already did pre-syscall sequence, so we go straight to syscall itself.
*/
/* have to include syscall instr here if we ended up hooking over it */
xl8_start_pc = pc;
if (hook_return_instr != NULL) {
pc = instr_encode(dcontext, hook_return_instr, pc);
ASSERT(pc != NULL);
instr_destroy(dcontext, hook_return_instr);
}
pc = emit_resume_jmp(pc, lpad_resume_pc, tgt_pc, xl8_start_pc);
/* update interception buffer pc */
interception_cur_pc = pc;
/* Replace original code with jmp to our version's entrance */
/* copy-on-write will give us a copy of this page */
ok = make_hookable(tgt_pc, 5, &changed_prot);
if (ok) {
ptr_int_t offset = (lpad_pc - (tgt_pc + 5));
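/* As in intercept_call(), the rel32 displacement is measured from the end
 * of the 5-byte jmp we are about to write at tgt_pc, hence lpad_pc - (tgt_pc + 5).
 */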
#ifdef X64
if (!REL32_REACHABLE_OFFS(offset)) {
ASSERT_NOT_IMPLEMENTED(false && "PR 245169: hook target too far: NYI");
/* FIXME PR 245169: we need use landing_pad_areas to alloc landing
* pads to trampolines, as done for PR 250294.
*/
}
#endif
pc = tgt_pc;
*pc = JMP_REL32_OPCODE;
pc++;
IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_int(offset)));
*((int *)pc) = (int) offset;
}
/* make our page unwritable now */
make_unhookable(tgt_pc, 5, changed_prot);
if (skip_syscall_pc != NULL)
*skip_syscall_pc = ret_pc;
return emit_pc;
}
/* two convenience routines for intercepting using the code[] buffer
* after the initialization routine has completed
*
* WARNING: only call this when there is only one thread going!
* This is not thread-safe!
*/
byte *
insert_trampoline(byte *tgt_pc, intercept_function_t prof_func,
void *callee_arg, bool assume_xsp, after_intercept_action_t action_after,
bool cti_safe_to_ignore)
{
byte *pc = interception_cur_pc;
/* Make interception code writable. NOTE: the interception code may
 * be in the vmareas executable list; we make the interception code temporarily
 * writable here without removing or flushing the region. This is ok since
 * we should be single-threaded when this function is called and we never
 * overwrite existing interception code. */
DEBUG_DECLARE(bool ok =)
make_writable(interception_code, INTERCEPTION_CODE_SIZE);
ASSERT(ok);
/* FIXME: worry about inserting trampoline across bb boundaries? */
interception_cur_pc = intercept_call(interception_cur_pc, tgt_pc, prof_func, callee_arg,
assume_xsp, action_after,
false, /* need the trampoline at all costs */
cti_safe_to_ignore, NULL, NULL);
/* FIXME: we assume early intercept_call failures are ok to
* ignore. Note we may want to crash instead if trying to sandbox
* malicious programs that may be able to prevent us from
* committing memory.
*/
ASSERT(interception_cur_pc - interception_code < INTERCEPTION_CODE_SIZE);
/* return interception code to read only state */
make_unwritable(interception_code, INTERCEPTION_CODE_SIZE);
return pc;
}
void
remove_trampoline(byte *our_pc, byte *tgt_pc)
{
un_intercept_call(our_pc, tgt_pc);
}
bool
is_in_interception_buffer(byte *pc)
{
return (pc >= interception_code &&
pc < interception_code + INTERCEPTION_CODE_SIZE);
}
bool
is_part_of_interception(byte *pc)
{
return (is_in_interception_buffer(pc) ||
vmvector_overlap(landing_pad_areas, pc, pc + 1));
}
bool
is_on_interception_initial_route(byte *pc)
{
if (vmvector_overlap(landing_pad_areas, pc, pc + 1)) {
/* Look for the forward jump. For x64, any ind jmp will do, as reverse
* jmp is direct.
*/
if (IF_X64_ELSE(*pc == JMP_ABS_IND64_OPCODE &&
*(pc + 1) == JMP_ABS_MEM_IND64_MODRM,
*pc == JMP_REL32_OPCODE &&
is_in_interception_buffer(PC_RELATIVE_TARGET(pc + 1)))) {
return true;
}
}
return false;
}
bool
is_syscall_trampoline(byte *pc, byte **tgt)
{
if (syscall_trampolines_start == NULL)
return false;
if (vmvector_overlap(landing_pad_areas, pc, pc + 1)) {
/* Also count the jmp from landing pad back to syscall instr, which is
* immediately after the jmp from landing pad to interception buffer (i#1027).
*/
app_pc syscall;
if (is_jmp_rel32(pc, pc, &syscall) &&
is_jmp_rel32(pc - JMP_LONG_LENGTH, NULL, NULL)) {
dcontext_t *dcontext = get_thread_private_dcontext();
instr_t instr;
if (dcontext == NULL)
dcontext = GLOBAL_DCONTEXT;
instr_init(dcontext, &instr);
decode(dcontext, syscall, &instr);
if (instr_is_syscall(&instr)) {
/* proceed using the 1st jmp */
pc -= JMP_LONG_LENGTH;
}
instr_free(dcontext, &instr);
}
#ifdef X64
/* target is 8 bytes back */
pc = *(app_pc *)(pc - sizeof(app_pc));
#else
if (!is_jmp_rel32(pc, pc, &pc))
return false;
#endif
}
if (pc >= syscall_trampolines_start && pc < syscall_trampolines_end) {
if (tgt != NULL)
*tgt = pc;
return true;
}
return false;
}
/****************************************************************************
*/
/* TRACK_NTDLL: try to find where kernel re-emerges into user mode when it
* dives into kernel mode
*/
#if TRACK_NTDLL
static byte *
make_writable_incr(byte *pc)
{
PBYTE pb = (PBYTE) pc;
MEMORY_BASIC_INFORMATION mbi;
DWORD old_prot;
int res;
res = query_virtual_memory(pb, &mbi, sizeof(mbi));
ASSERT(res == sizeof(mbi));
res = protect_virtual_memory(mbi.BaseAddress, mbi.RegionSize,
PAGE_EXECUTE_WRITECOPY, &old_prot);
ASSERT(res);
return (byte *)((int)mbi.BaseAddress + (int)mbi.RegionSize);
}
static byte *
make_inaccessible(byte *pc)
{
PBYTE pb = (PBYTE) pc;
MEMORY_BASIC_INFORMATION mbi;
DWORD old_prot;
int res;
res = query_virtual_memory(pb, &mbi, sizeof(mbi));
ASSERT(res == sizeof(mbi));
res = protect_virtual_memory(mbi.BaseAddress, mbi.RegionSize,
PAGE_NOACCESS, &old_prot);
ASSERT(res);
return (byte *)((int)mbi.BaseAddress + (int)mbi.RegionSize);
}
void
wipe_out_ntdll()
{
byte * start = (byte *) 0x77F81000;
byte * stop = (byte *) 0x77FCD95B;
byte *pc;
/* first suspend all other threads */
thread_record_t **threads;
int i, num_threads;
mutex_lock(&thread_initexit_lock);
get_list_of_threads(&threads, &num_threads);
for (i=0; i<num_threads; i++) {
if (threads[i]->id != get_thread_id()) {
LOG(GLOBAL, LOG_ASYNCH, 1, "Suspending thread "TIDFMT" == "PFX"\n",
threads[i]->id, threads[i]->handle);
SuspendThread(threads[i]->handle);
}
}
mutex_unlock(&thread_initexit_lock);
global_heap_free(threads, num_threads*sizeof(thread_record_t*)
HEAPACCT(ACCT_THREAD_MGT));
LOG(GLOBAL, LOG_ASYNCH, 1, "INVALIDATING ENTIRE NTDLL.DLL!!!\n");
pc = start;
while (pc < stop) {
LOG(GLOBAL, LOG_ASYNCH, 1, "\t"PFX"\n", pc);
#if 0
pc = make_inaccessible(pc);
#else
pc = make_writable_incr(pc);
#endif
}
#if 1
for (pc=start; pc<stop; pc++) {
*pc = 0xcc;
}
#endif
}
#endif /* TRACK_NTDLL */
/*
****************************************************************************/
/* If we receive an asynch event while we've lost control but before we
* reach the image entry point or our other retakeover points we should
* retakeover, to minimize the amount of code run natively -- these should
* be rare during init and the perf hit of repeated flushing and re-walking
* the memory list shouldn't be an issue.
* Separated from asynch_take_over to not force its callers to do this.
*/
static inline void
asynch_retakeover_if_native()
{
thread_record_t *tr = thread_lookup(get_thread_id());
ASSERT(tr != NULL);
if (IS_UNDER_DYN_HACK(tr->under_dynamo_control)) {
ASSERT(!reached_image_entry_yet());
/* must do a complete takeover-after-native */
retakeover_after_native(tr, INTERCEPT_EARLY_ASYNCH);
}
}
/* This routine is called by a DynamoRIO routine that was invoked natively,
* i.e., not under DynamoRIO control.
* This routine takes control using the application state in its arguments,
* and starts execution under DynamoRIO at start_pc.
* state->callee_arg is a boolean "save_dcontext":
* If save_dcontext is true, it saves the cur dcontext on the callback stack
* of dcontexts and proceeds to execute with a new dcontext.
* Otherwise, it uses the current dcontext, which has its trace squashed.
*/
static void
asynch_take_over(app_state_at_intercept_t *state)
{
dcontext_t *dcontext;
bool save_dcontext = (bool)(ptr_uint_t) state->callee_arg;
if (save_dcontext) {
/* save cur dcontext and get a new one */
dcontext = callback_setup(state->start_pc);
} else {
dcontext = get_thread_private_dcontext();
ASSERT(dcontext->initialized);
/* case 9347 we want to let go after image entry point */
if (RUNNING_WITHOUT_CODE_CACHE()
&& dcontext->next_tag == BACK_TO_NATIVE_AFTER_SYSCALL
&& state->start_pc == image_entry_pc) {
ASSERT(dcontext->native_exec_postsyscall == image_entry_pc);
} else {
ASSERT(!RUNNING_WITHOUT_CODE_CACHE());
dcontext->next_tag = state->start_pc;
}
/* if we were building a trace, kill it */
if (is_building_trace(dcontext)) {
LOG(THREAD, LOG_ASYNCH, 2, "asynch_take_over: squashing old trace\n");
trace_abort(dcontext);
}
}
ASSERT(os_using_app_state(dcontext));
LOG(THREAD, LOG_ASYNCH, 2, "asynch_take_over 0x%08x\n", state->start_pc);
/* may have been inside syscall...now we're in app! */
set_at_syscall(dcontext, false);
/* tell dispatch() why we're coming there */
if (dcontext->whereami != WHERE_APP) /* new thread, typically: leave it that way */
dcontext->whereami = WHERE_TRAMPOLINE;
set_last_exit(dcontext, (linkstub_t *) get_asynch_linkstub());
transfer_to_dispatch(dcontext, &state->mc, false/*!full_DR_state*/);
ASSERT_NOT_REACHED();
}
bool
new_thread_is_waiting_for_dr_init(thread_id_t tid, app_pc pc)
{
uint i;
/* i#1443c#4: check for a thread that's about to hit our hook */
if (pc == LdrInitializeThunk || pc == (app_pc)KiUserApcDispatcher)
return true;
/* We check until the max to avoid races on threads_waiting_count */
for (i = 0; i < MAX_THREADS_WAITING_FOR_DR_INIT; i++) {
if (threads_waiting_for_dr_init[i] == tid)
return true;
}
return false;
}
static void
possible_new_thread_wait_for_dr_init(CONTEXT *cxt)
{
/* Because of problems with injected threads while we are initializing
* (case 5167, 5020, 5103 bunch of others) we block here while the main
* thread finishes initializing. Once dynamo_exited is set it is safe to
* let the thread continue since dynamo_thread_init will immediately
* return. */
uint idx;
#ifdef CLIENT_SIDELINE
/* We allow a client init routine to create client threads: DR is
* initialized enough by now
*/
if (((void *)cxt->CXT_XIP == (void *)client_thread_target))
return;
#endif
if (dynamo_initialized || dynamo_exited)
return;
/* i#1443: communicate with os_take_over_all_unknown_threads() */
idx = atomic_add_exchange_int((volatile int *)&threads_waiting_count, 1);
idx--; /* -1 to get index from count */
ASSERT(idx < MAX_THREADS_WAITING_FOR_DR_INIT);
if (idx >= MAX_THREADS_WAITING_FOR_DR_INIT) {
/* What can we do? We'll have to risk it and hope this thread is scheduled
* and initializes before os_take_over_all_unknown_threads() runs.
*/
} else {
threads_waiting_for_dr_init[idx] = get_thread_id();
}
while (!dynamo_initialized && !dynamo_exited) {
STATS_INC(apc_yields_while_initializing);
os_thread_yield();
}
if (idx < MAX_THREADS_WAITING_FOR_DR_INIT) {
/* os_take_over_all_unknown_threads()'s context check will work from here */
threads_waiting_for_dr_init[idx] = INVALID_THREAD_ID;
}
}
/* returns true if intercept function should return immediately and let go,
* false if intercept function should continue processing and maybe takeover */
static bool
intercept_new_thread(CONTEXT *cxt)
{
#ifdef CLIENT_INTERFACE
bool is_client = false;
#endif
byte *dstack = NULL;
priv_mcontext_t mc;
/* init apc, check init_apc_go_native to sync w/detach */
if (init_apc_go_native) {
/* need to wait after checking _go_native to avoid a thread
* going native too early because of races between setting
* _go_native and _pause */
if (init_apc_go_native_pause) {
/* FIXME : this along with any other logging in this
* method could potentially be a race condition with detach
* cleanup, though it is unlikely */
LOG(GLOBAL, LOG_ALL, 2,
"Thread waiting at init_apc for detach to finish\n");
}
while (init_apc_go_native_pause) {
os_thread_yield();
}
/* just return, FIXME : see concerns in detach_helper about
* getting to native code before the interception_code is
* freed and getting out of here before the dll is unloaded
*/
# if 0 /* this is not a dynamo controlled thread! */
SELF_PROTECT_LOCAL(get_thread_private_dcontext(), READONLY);
# endif
return true /* exit intercept function and let go */;
}
/* should keep in sync with changes in intercept_image_entry() for
* thread initialization
*/
/* initialize thread now */
#ifdef CLIENT_SIDELINE
/* i#41/PR 222812: client threads target a certain routine and always
* directly, never via the win API (so we don't check THREAD_START_ADDR)
*/
is_client = ((void *)cxt->CXT_XIP == (void *)client_thread_target);
if (is_client) {
/* client threads start out on dstack */
GET_STACK_PTR(dstack);
ASSERT(is_dynamo_address(dstack));
/* we assume that less than a page will have been used */
dstack = (byte *) ALIGN_FORWARD(dstack, PAGE_SIZE);
}
#endif
context_to_mcontext(&mc, cxt);
if (dynamo_thread_init(dstack, &mc _IF_CLIENT_INTERFACE(is_client)) != -1) {
app_pc thunk_xip = (app_pc)cxt->CXT_XIP;
dcontext_t *dcontext = get_thread_private_dcontext();
LOG_DECLARE(char sym_buf[MAXIMUM_SYMBOL_LENGTH];)
bool is_nudge_thread = false;
#ifdef CLIENT_SIDELINE
if (is_client) {
ASSERT(is_on_dstack(dcontext, (byte *)cxt->CXT_XSP));
/* PR 210591: hide our threads from DllMain by not executing rest
* of Ldr init code and going straight to target. create_thread()
* already set up the arg in cxt.
*/
nt_continue(cxt);
ASSERT_NOT_REACHED();
}
#endif
/* Xref case 552, to ameliorate the risk of an attacker
* leveraging our detach routines etc. against us, we detect
* an incoming nudge thread here during thread init and set
* a dcontext flag that the nudge routines can later verify.
* Attacker could still bypass if can control the start addr
* of a new thread (FIXME). We check both Xax and Xip since
* nodemgr has the ability to target directly or send through
* kernel32 start thunk (though only start thunk, i.e. xax,
* is currently used). If we move to just directly targeted,
* i.e. xip, would be a lot harder for the attacker since
* the documented API routines all hardcode that value.
*
* The nudge related checks below were moved above thread_policy checks
* because there is no dependency and because process control nudge for
* thin_client needs it; part of cases 8884, 8594 & 8888. */
ASSERT(dcontext != NULL && dcontext->nudge_target == NULL);
if ((void *)cxt->CXT_XIP == (void *)generic_nudge_target ||
(void *)cxt->THREAD_START_ADDR == (void *)generic_nudge_target) {
LOG(THREAD, LOG_ALL, 1, "Thread targeting nudge.\n");
if (dcontext != NULL) {
dcontext->nudge_target = (void *)generic_nudge_target;
}
is_nudge_thread = true;
}
/* FIXME: temporary fix for case 9467 - mute nudges for cygwin apps.
* Long term fix is to make nudge threads go directly to their targets.
*/
if (is_nudge_thread && DYNAMO_OPTION(thin_client) && DYNAMO_OPTION(mute_nudge)) {
TRY_EXCEPT(dcontext, { /* to prevent crashes when walking the ldr list */
PEB *peb = get_own_peb();
PEB_LDR_DATA *ldr = peb->LoaderData;
LIST_ENTRY *e;
LIST_ENTRY *start = &ldr->InLoadOrderModuleList;
LDR_MODULE *mod;
uint traversed = 0;
/* Note: this loader module list walk is racy with the loader;
* can't really grab the loader lock here. Shouldn't be a big
* problem as this is a temp fix anyway. */
for (e = start->Flink; e != start; e = e->Flink) {
mod = (LDR_MODULE *) e;
if (wcsstr(mod->BaseDllName.Buffer, L"cygwin1.dll") != NULL) {
os_terminate(dcontext, TERMINATE_THREAD|TERMINATE_CLEANUP);
ASSERT_NOT_REACHED();
}
if (traversed++ > MAX_MODULE_LIST_INFINITE_LOOP_THRESHOLD) {
SYSLOG_INTERNAL_WARNING("nudge muting: too many modules");
break;
}
}
}, { /* do nothing */ });
}
/* For thin_client, let go right after we init the thread, i.e., create
* the dcontext; don't do the thread policy stuff, that requires locks
* that aren't initialized in this mode! */
if (DYNAMO_OPTION(thin_client))
return true /* exit intercept function and let go */;
/* In fact the apc_target is ntdll!LdrInitializeThunk
* (for all threads not only the first one).
* Note for vista that threads do not start with an apc, but rather
* directly show up at ntdll!LdrInitializeThunk (which we hook on
* vista to call this routine). Note that the thunk will return via
* an NtContinue to a context on the stack so really we see the same
* behavior as before except we don't go through the apc dispatcher.
*
* For threads created by kernel32!CreateRemoteThread pre vista
* the cxt->Xip then is kernel32!Base{Process,Thread}StartThunk (not exported),
* while the cxt->Xax is the user thread procedure and cxt->Xbx is the arg.
* On vista it's the same except cxt->Xip is set to ntdll!RtlUserThreadStart
* (which is exported in ntdll.dll) by the kernel.
*
* kernel32!BaseProcessStartThunk, or kernel32!BaseThreadStartThunk
* on all versions I've tested start with
* 0xed33 xor ebp,ebp
*
* Note, of course, that direct NtCreateThread calls
* can go anywhere they want (including on Vista). For example toolhelp
* uses NTDLL!RtlpQueryProcessDebugInformationRemote
* as the xip so shouldn't count much on this. NtCreateThreadEx threads
* (vista only) will, however, always have xip=ntdll!RtlUserThreadStart
* since the kernel sets that.
*/
/* keep in mind this is a 16-bit match */
#define BASE_THREAD_START_THUNK_USHORT 0xed33
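/* Read as a little-endian ushort, 0xed33 matches the byte sequence 33 ed,
 * i.e., the "xor ebp,ebp" encoding shown above.
 */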
/* see comments in os.c pre_system_call CreateThread, Xax holds
* the win32 start address (Nebbett), Xbx holds the argument
* (observation). Same appears to hold for CreateThreadEx. */
/* Note that the initial thread won't log here */
LOG(THREAD_GET, LOG_THREADS, 1,
"New Thread : Win32 start address "PFX" arg "PFX", thunk xip="PFX"\n",
cxt->THREAD_START_ADDR, cxt->THREAD_START_ARG, cxt->CXT_XIP);
DOLOG(1, LOG_THREADS, {
print_symbolic_address((app_pc)cxt->THREAD_START_ADDR, sym_buf, sizeof(sym_buf),
false);
LOG(THREAD_GET, LOG_THREADS, 1,
"Symbol information for start address %s\n", sym_buf);
});
DOLOG(2, LOG_THREADS, {
print_symbolic_address((app_pc)cxt->CXT_XIP, sym_buf, sizeof(sym_buf),
false);
LOG(THREAD_GET, LOG_THREADS, 2,
"Symbol information for thunk address %s\n", sym_buf);
});
/* start address should be set at thread initialization */
if (dcontext->win32_start_addr == (app_pc)cxt->THREAD_START_ARG) {
/* case 10965/PR 215400: WOW64 & x64 query returns arg for some reason */
#ifndef X64
ASSERT(is_wow64_process(NT_CURRENT_PROCESS));
#endif
dcontext->win32_start_addr = (app_pc)cxt->THREAD_START_ADDR;
}