| /* ********************************************************** |
| * Copyright (c) 2010-2014 Google, Inc. All rights reserved. |
| * Copyright (c) 2002-2010 VMware, Inc. All rights reserved. |
| * **********************************************************/ |
| |
| /* |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * * Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * * Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * * Neither the name of VMware, Inc. nor the names of its contributors may be |
| * used to endorse or promote products derived from this software without |
| * specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE |
| * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
| * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH |
| * DAMAGE. |
| */ |
| |
| /* Copyright (c) 2003-2007 Determina Corp. */ |
| /* Copyright (c) 2002-2003 Massachusetts Institute of Technology */ |
| /* Copyright (c) 2002 Hewlett-Packard Company */ |
| |
| /* |
| * callback.c - windows-specific callback, APC, and exception handling routines |
| */ |
| |
| /* This whole file assumes x86 */ |
| #include "configure.h" |
| #ifndef X86 |
| #error X86 must be defined |
| #endif |
| |
| #include "../globals.h" |
| #include "arch.h" |
| #include "instr.h" |
| #include "decode.h" |
| #include "../monitor.h" |
| #include "../fcache.h" |
| #include "../fragment.h" |
| #include "decode_fast.h" |
| #include "disassemble.h" |
| #include "instr_create.h" |
| #include "ntdll.h" |
| #include "events.h" |
| #include "os_private.h" |
| #include "../moduledb.h" |
| #include "aslr.h" |
| #include "../nudge.h" /* for generic_nudge_target() address */ |
| #ifdef RETURN_AFTER_CALL |
| # include "../rct.h" /* for rct_ind_branch_target_lookup */ |
| #endif |
| #include "instrument.h" |
| #include "../perscache.h" |
| #include "../translate.h" |
| |
| #include <string.h> /* for memcpy */ |
| #include <windows.h> |
| |
| /* forward declarations */ |
| static dcontext_t * callback_setup(app_pc next_pc); |
| static byte * insert_image_entry_trampoline(dcontext_t *dcontext); |
| static void swap_dcontexts(dcontext_t *done, dcontext_t *dtwo); |
| static void asynch_take_over(app_state_at_intercept_t *state); |
| |
| /* currently we do not intercept top level exceptions */ |
| #ifdef INTERCEPT_TOP_LEVEL_EXCEPTIONS |
| /* the app's top-level exception handler */ |
| static LPTOP_LEVEL_EXCEPTION_FILTER app_top_handler; |
| #endif |
| |
| /* All of our hooks use landing pads to then indirectly target |
| * this interception code, which in turn assumes it can directly |
| * reach our hook targets in the DR lib. Thus, we want this
| * interception buffer to be in neither vmcode nor vmheap, but near the
| * DR lib, which is simplest with a static array.
| * We write-protect this, so we don't need the ASLR of our heap. |
| */ |
| ALIGN_VAR(4096) static byte interception_code_array[INTERCEPTION_CODE_SIZE]; |
| |
| /* interception information |
| * if it weren't for syscall trampolines this could be a single page |
| * Note: if you add more intercept points, make sure to adjust |
| * NUM_INTERCEPT_POINTS below. |
| */ |
| static byte * interception_code = NULL; |
| static byte * interception_cur_pc = NULL; |
| static byte * ldr_init_pc = NULL; |
| static byte * callback_pc = NULL; |
| static byte * apc_pc = NULL; |
| static byte * exception_pc = NULL; |
| static byte * raise_exception_pc = NULL; |
| static byte * after_callback_orig_pc = NULL; |
| static byte * after_apc_orig_pc = NULL; |
| static byte * load_dll_pc = NULL; |
| static byte * unload_dll_pc = NULL; |
| static byte * image_entry_pc = NULL; |
| static byte * image_entry_trampoline = NULL; |
| static byte * syscall_trampolines_start = NULL; |
| static byte * syscall_trampolines_end = NULL; |
| |
| /* We rely on the compiler doing the right thing, so that when we dereference
|  * an imported function we get its real address instead of a stub in our
|  * module.  The loader does the rest of the magic.
|  */
| GET_NTDLL(KiUserApcDispatcher, (IN PVOID Unknown1, |
| IN PVOID Unknown2, |
| IN PVOID Unknown3, |
| IN PVOID ContextStart, |
| IN PVOID ContextBody)); |
| GET_NTDLL(KiUserCallbackDispatcher, (IN PVOID Unknown1, |
| IN PVOID Unknown2, |
| IN PVOID Unknown3)); |
| GET_NTDLL(KiUserExceptionDispatcher, (IN PVOID Unknown1, |
| IN PVOID Unknown2)); |
| GET_NTDLL(KiRaiseUserExceptionDispatcher, (void)); |
| |
| /* generated routine for taking over native threads */ |
| byte *thread_attach_takeover; |
| |
| static byte * |
| emit_takeover_code(byte *pc); |
| |
| /* For detach */ |
| bool init_apc_go_native = false; |
| bool init_apc_go_native_pause = false; |
| |
| /* overridden by dr_preinjected, or retakeover_after_native() */ |
| static retakeover_point_t interception_point = INTERCEPT_PREINJECT; |
| |
| /* While emitting the trampoline, the alternate target is unknown for hotp_only. */
| #define CURRENTLY_UNKNOWN ((byte *)(ptr_uint_t) 0xdeadc0de) |
| |
| #ifdef DEBUG |
| #define INTERCEPT_POINT(point) STRINGIFY(point), |
| static const char * const retakeover_names[] = { |
| INTERCEPT_ALL_POINTS |
| }; |
| #undef INTERCEPT_POINT |
| #endif |
| |
| /* We keep a list of mappings from intercept points to original app PCs */ |
| typedef struct _intercept_map_elem_t { |
| byte *interception_pc; |
| app_pc original_app_pc; |
| size_t displace_length; /* includes jmp back */ |
| size_t orig_length; |
| struct _intercept_map_elem_t *next; |
| } intercept_map_elem_t; |
| |
| typedef struct _intercept_map_t { |
| intercept_map_elem_t *head; |
| intercept_map_elem_t *tail; |
| } intercept_map_t; |
| |
| static intercept_map_t *intercept_map; |
| |
| DECLARE_CXTSWPROT_VAR(static mutex_t map_intercept_pc_lock, |
| INIT_LOCK_FREE(map_intercept_pc_lock)); |
| |
| DECLARE_CXTSWPROT_VAR(static mutex_t emulate_write_lock, |
| INIT_LOCK_FREE(emulate_write_lock)); |
| |
| #ifdef STACK_GUARD_PAGE |
| DECLARE_CXTSWPROT_VAR(static mutex_t exception_stack_lock, |
| INIT_LOCK_FREE(exception_stack_lock)); |
| #endif |
| |
| DECLARE_CXTSWPROT_VAR(static mutex_t intercept_hook_lock, |
| INIT_LOCK_FREE(intercept_hook_lock)); |
| |
| /* Only used for Vista, where new threads start directly here instead of going
|  * through KiUserApcDispatcher first.  This isn't in our import lib (though it is
|  * exported on 2k, xp, and vista at least) so we look it up dynamically. */
| static byte *LdrInitializeThunk = NULL; |
| /* On Vista this is the address the kernel sets (via NtCreateThreadEx, used by all the
|  * api routines) as Xip in the context that LdrInitializeThunk NtContinue's to (it is
|  * equivalent to the unexported kernel32!Base[Process,Thread]StartThunk in pre-Vista).
|  * Fortunately ntdll!RtlUserThreadStart is exported, and we cache it here for use in
|  * intercept_new_thread().  Note that threads created via the legacy native
|  * NtCreateThread don't have to target this address. */
| static byte *RtlUserThreadStart = NULL; |
| |
| #ifndef X64 |
| /* Used to create a clean syscall wrapper on win8 where there's no ind call */ |
| static byte *KiFastSystemCall = NULL; |
| #endif |
| |
| /* i#1443: we need to identify threads queued up waiting for DR init. |
| * We can't use heap of course so we have to use a max count. |
| * We've never seen more than one at a time. |
| */ |
| #define MAX_THREADS_WAITING_FOR_DR_INIT 8 |
| /* We assume INVALID_THREAD_ID is 0 (checked in callback_init()). */ |
| /* These need to be neverprot for use w/ new threads. The risk is small. */ |
| DECLARE_NEVERPROT_VAR(static thread_id_t threads_waiting_for_dr_init |
| [MAX_THREADS_WAITING_FOR_DR_INIT], {0}); |
| /* This is also the next index+1 into the array to write to, incremented atomically. */ |
| DECLARE_NEVERPROT_VAR(static uint threads_waiting_count, 0); |
| |
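| /* Returns the routine we redirect intercepted set-context operations to:
|  * the generated nt_continue_dynamo_start routine, which resumes execution
|  * under DR control.
|  */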
| static inline app_pc |
| get_setcontext_interceptor(void)
| { |
| return (app_pc) nt_continue_dynamo_start; |
| } |
| |
| /* if tid != self, must hold thread_initexit_lock */ |
| void |
| set_asynch_interception(thread_id_t tid, bool intercept) |
| { |
| /* Needed to turn on and off asynchronous event interception |
| * for non-entire-application-under-dynamo-control situations |
| */ |
| thread_record_t *tr = thread_lookup(tid); |
| ASSERT(tr != NULL); |
| tr->under_dynamo_control = intercept; |
| } |
| |
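| /* Returns whether asynchronous event interception is enabled process-wide. */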
| static inline bool |
| intercept_asynch_global() |
| { |
| return (intercept_asynch && !INTERNAL_OPTION(nullcalls)); |
| } |
| |
| /* if tr is not for calling thread, must hold thread_initexit_lock */ |
| static bool |
| intercept_asynch_common(thread_record_t *tr, bool intercept_unknown) |
| { |
| if (!intercept_asynch_global()) |
| return false; |
| if (tr == NULL) { |
| if (intercept_unknown) |
| return true; |
| /* caller should have made all attempts to get tr */ |
| if (control_all_threads) { |
| /* we should know about all threads! */ |
| SYSLOG_INTERNAL_WARNING("Received asynch event for unknown thread "TIDFMT"", get_thread_id()); |
| /* try to make everything run rather than assert -- just do |
| * this asynch natively, we probably received it for a thread that's |
| * been created but not scheduled? |
| */ |
| } |
| return false; |
| } |
| /* FIXME: under_dynamo_control should be an enum w/ separate |
| * values for 1) truly native, 2) under DR but currently native_exec, |
| * 3) temporarily native b/c DR lost control (== UNDER_DYN_HACK), and |
| * 4) fully under DR |
| */ |
| DOSTATS({ |
| if (IS_UNDER_DYN_HACK(tr->under_dynamo_control)) |
| STATS_INC(num_asynch_while_lost); |
| }); |
| return (tr->under_dynamo_control || IS_CLIENT_THREAD(tr->dcontext)); |
| } |
| |
| /* if tid != self, must hold thread_initexit_lock */ |
| bool |
| intercept_asynch_for_thread(thread_id_t tid, bool intercept_unknown) |
| { |
| /* Needed to turn on and off asynchronous event interception |
| * for non-entire-application-under-dynamo-control situations |
| */ |
| thread_record_t *tr = thread_lookup(tid); |
| return intercept_asynch_common(tr, intercept_unknown); |
| } |
| |
| bool |
| intercept_asynch_for_self(bool intercept_unknown) |
| { |
| /* To avoid problems with the all_threads_lock required to look |
| * up a thread in the thread table, we first see if it has a |
| * dcontext, and if so we get the thread_record_t from there. |
| * If not, it probably is a native thread and grabbing the lock |
| * should cause no problems as it should not currently be holding |
| * any locks. |
| */ |
| dcontext_t *dcontext = get_thread_private_dcontext(); |
| if (dcontext != NULL) |
| return intercept_asynch_common(dcontext->thread_record, intercept_unknown); |
| else |
| return intercept_asynch_for_thread(get_thread_id(), intercept_unknown); |
| } |
| |
| |
| /*************************************************************************** |
| * INTERCEPTION CODE FOR TRAMPOLINES INSERTED INTO APPLICATION CODE |
| |
| Interception code either assumes that the app's xsp is valid, or uses the
| dstack if available, or as a last resort uses the initstack.  When using the
| initstack, the handler must make sure all paths exiting it clear the
| initstack mutex once it is no longer using the initstack itself!
| |
| We clobber TIB->PID, which is believed to be safe since no user-mode |
| code will execute there b/c thread is not alertable, and the kernel |
| shouldn't be reading (and trusting) user mode TIB structures. |
| FIXME: allocate and use a TIB scratch slot instead |
| |
| N.B.: this interception code, if encountered by DR, is let run natively, |
| so make sure DR takes control at the end! |
| |
| For trying to use the dstack, we have to be careful and check if we're |
| already on the dstack, which can happen for internal exceptions -- |
| hopefully not for callbacks or apcs, we should assert on that => |
| FIXME: add such checks to the cb and apc handlers, and split dstack |
| check as a separate parameter, once we make cbs and apcs not |
| assume_xsp (they still do for now since we haven't tested enough to |
| convince ourselves we never get them while on the dstack) |
| |
| Unfortunately there's no easy way to check w/o modifying flags, so for |
| now we assume eflags whenever we do not assume xsp, unless we assume |
| we're not on the dstack. |
| Assumption should be ok for Ki*, also for Ldr*. |
| |
| Alternatives: check later when we're in exception handler, only paths |
| there are terminate or forge exception. Thus we can get away w/o |
| reading anything on stack placed by kernel, but we won't have clean |
| call stack or anything else for diagnostics, and we'll have clobbered |
| the real xsp in the mcontext slot, which we use for forging the |
| exception. |
| |
| Could perhaps use whereami==WHERE_FCACHE, but could get exception during |
| clean call or cxt switch when on dstack but prior to whereami change. |
| |
| Note: the app registers passed to the handler are restored when going back to |
| the app, which means any changes made by the handler will be reflected |
| in the app state; |
| FIXME: change handler prototype to make all registers volatile so that the |
| compiler doesn't clobber them; for now it is the user's responsibility. |
| |
| if (!assume_xsp) |
| mov xcx, fs:$PID_TIB_OFFSET # save xcx |
| mov fs:$TLS_DCONTEXT_OFFSET, xcx |
| jecxz no_local_stack |
| if (!assume_not_on_dstack) |
| # need to check if already on dstack |
| # assumes eflags! |
| mov $DSTACK(xcx), xcx |
| cmp xsp, xcx |
| jge not_on_dstack |
| lea -DYNAMORIO_STACK_SIZE(xcx), xcx |
| cmp xsp, xcx |
| jl not_on_dstack |
| # record stack method: using dstack/initstack unmodified |
| push xsp |
| push $2 |
| jmp have_stack_now |
| not_on_dstack: |
| mov fs:$TLS_DCONTEXT_OFFSET, xcx |
| endif |
| # store app xsp in dcontext & switch to dstack; this will be used to save |
| # app xsp on the switched stack, i.e., dstack; not used after that. |
| if TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask) |
| mov $MCONTEXT_OFFSET(xcx), xcx |
| endif |
| mov xsp, $XSP_OFFSET(xcx) |
| if TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask) |
| mov fs:$TLS_DCONTEXT_OFFSET, xcx |
| endif |
| mov $DSTACK(xcx), xsp |
| |
| # now get the app xsp from the dcontext and put it on the dstack; this |
| # will serve as the app xsp cache and will be used to send the correct |
| # app xsp to the handler and to restore app xsp at exit |
| if TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask) |
| mov $MCONTEXT_OFFSET(xcx), xcx |
| endif |
| mov $XSP_OFFSET(xcx), xcx |
| push xcx |
| |
| # need to record stack method, since dcontext could change in handler |
| push $1 |
| jmp have_stack_now |
| no_local_stack: |
| # use initstack -- it's a global synch, but should only have no |
| # dcontext for initializing thread (where we actually use the app's |
| # stack) or exiting thread |
| # If we are already on the initstack, should just continue to use it. |
| # need to check if already on initstack |
| # assumes eflags, but we already did on this path for checking dstack |
| mov $INITSTACK, xcx |
| cmp xsp, xcx |
| jge grab_initstack |
| lea -DYNAMORIO_STACK_SIZE(xcx), xcx |
| cmp xsp, xcx |
| jl grab_initstack |
| push xsp |
| # record stack method: using dstack/initstack unmodified |
| push $2 |
| jmp have_stack_now |
| grab_initstack: |
| mov $1, ecx # upper 32 bits zeroed on x64 |
| if x64 # can't directly address initstack_mutex or initstack_app_xsp |
| # (though we could use rip-relative, nice to not have reachability issues |
| # if located far from dynamorio.dll, for general hooks (PR 250294)!) |
| # if a new thread we can't easily (w/o syscall) replace tid, so we use peb |
| mov xax, fs:$PEB_TIB_OFFSET # save xax |
| endif |
| get_lock: |
| if x64 # can't directly address initstack_mutex or initstack_app_xsp |
| mov $INITSTACK_MUTEX, xax |
| endif |
| # initstack_mutex.lock_requests is 32-bit |
| xchg ecx, IF_X64_ELSE((xax), initstack_mutex) |
| jecxz have_lock |
| pause # improve spin-loop perf on P4 |
| jmp get_lock # no way to sleep or anything, must spin |
| have_lock: |
| # app xsp is saved in initstack_app_xsp only so that it can be accessed after |
| # switching to initstack; used only to set up the app xsp on the initstack |
| if x64 # we don't need to set initstack_app_xsp, just push the app xsp value |
| mov xsp, xcx |
| mov initstack, xax |
| xchg xax, xsp |
| push xcx |
| else |
| mov xsp, initstack_app_xsp |
| mov initstack, xsp |
| push initstack_app_xsp |
| endif |
| # need to record stack method, since dcontext could change in handler |
| push $0 |
| if x64 |
| mov $peb_ptr, xax |
| xchg fs:$PEB_TIB_OFFSET, xax # restore xax and peb ptr |
| endif |
| have_stack_now: |
| if x64 |
| mov $global_pid, xcx |
| xchg fs:$PID_TIB_OFFSET, xcx # restore xcx and pid |
| else |
| mov fs:$PID_TIB_OFFSET, xcx # restore xcx |
| mov $global_pid, fs:$PID_TIB_OFFSET # restore TIB PID |
| endif |
| else |
| push xsp # cache app xsp so that it can be used to send the right value |
| # to the handler and to restore app xsp safely at exit |
| push $3 # recording stack type when using app stack |
| endif |
| # we assume here that we've done two pushes on the stack, |
| # which combined w/ the push0 and pushf give us 16-byte alignment |
| # for 32-bit and 64-bit prior to push-all-regs |
| clean_call_setup: |
| # lay out pc, eflags, regs, etc. in app_state_at_intercept_t order |
| push $0 # pc slot; unused; could use instead of state->start_pc |
| pushf |
| pusha (or push all regs for x64) |
| push $0 # ASSUMPTION: clearing, not preserving, is good enough |
| # FIXME: this won't work at CPL0 if we ever run there! |
| popf |
| |
| # get the cached app xsp and write it to pusha location, |
| # so that the handler gets the correct app xsp |
| mov sizeof(priv_mcontext_t)+XSP_SZ(xsp), xax |
| mov xax, offsetof(priv_mcontext_t, xsp)(xsp) |
| |
| if (ENTER_DR_HOOK != NULL) |
| call ENTER_DR_HOOK |
| endif |
| if x64 |
| mov no_cleanup, xax |
| push xax |
| mov handler_arg, xax |
| push xax |
| else |
| push no_cleanup |
| push handler_arg |
| endif |
| # now we've laid out app_state_at_intercept_t on the stack |
| push/mov xsp # a pointer to the pushed values; this is the argument; |
| # see case 7597. may be passed in a register. |
| call handler |
| <clean up args> |
| lea 2*XSP_SZ(xsp), xsp # pop handler_arg + no_cleanup
| if (AFTER_INTERCEPT_DYNAMIC_DECISION) |
| cmp xax, AFTER_INTERCEPT_LET_GO |
| je let_go |
| if (alternate target provided) |
| cmp xax, AFTER_INTERCEPT_LET_GO_ALT_DYN |
| je let_go_alt |
| endif |
| endif |
| if (AFTER_INTERCEPT_TAKE_OVER || AFTER_INTERCEPT_TAKE_OVER_SINGLE_SHOT |
| || AFTER_INTERCEPT_DYNAMIC_DECISION) |
| if x64 |
| mov no_cleanup, xax |
| push xax |
| else |
| push no_cleanup # state->start_pc |
| endif |
| push $0 # we assume always want !save_dcontext as arg to asynch_take_over |
| push/mov xsp # app_state_at_intercept_t * |
| call asynch_take_over |
| # should never reach here |
| push $0 |
| push $-3 # internal_error will report -3 as line number |
| push $0 |
| call internal_error |
| endif |
| if (AFTER_INTERCEPT_DYNAMIC_DECISION && alternate target provided) |
| let_go_alt: |
| <complete duplicate of let_go, but ending in a jmp to alternate target> |
| <(cannot really share much of let_go cleanup w/o more scratch slots)> |
| <(has to be first since original app instrs are placed by caller, not us)> |
| endif |
| if (!AFTER_INTERCEPT_TAKE_OVER) |
| let_go: |
| if (EXIT_DR_HOOK != NULL) |
| call EXIT_DR_HOOK |
| endif |
| # get the xsp passed to the handler, which may have been |
| # changed; store it in the xsp cache to restore at exit |
| mov offsetof(priv_mcontext_t, xsp)(xsp), xax |
| mov xax, sizeof(priv_mcontext_t)+XSP_SZ(xsp) |
| popa # or pop all regs on x64 |
| popf |
| lea XSP_SZ(xsp), xsp # clear pc slot |
| if (!assume_xsp) |
| mov xcx, fs:$PID_TIB_OFFSET # save xcx |
| pop xcx # get back const telling stack used |
| pop xsp |
| jecxz restore_initstack |
| jmp done_restoring |
| restore_initstack: |
| if x64 |
| mov &initstack_mutex, xcx |
| mov $0, (xcx) |
| else |
| mov $0, initstack_mutex |
| endif |
| done_restoring: |
| if x64 |
| mov $global_pid, xcx |
| xchg fs:$PID_TIB_OFFSET, xcx # restore xcx and pid |
| else |
| mov fs:$PID_TIB_OFFSET, xcx # restore xcx |
| mov $global_pid, fs:$PID_TIB_OFFSET # restore TIB PID |
| endif |
| else |
| lea XSP_SZ(xsp), xsp # clear out the stack type |
| pop xsp # handler may have changed xsp; so get it from the xsp cache |
| endif |
| endif (!AFTER_INTERCEPT_TAKE_OVER) |
| no_cleanup: |
| <original app instructions> |
| |
| => handler signature, exported as typedef intercept_function_t: |
| void handler(app_state_at_intercept_t *args) |
| |
| if AFTER_INTERCEPT_TAKE_OVER, then asynch_take_over is called, with "false" for |
| its save_dcontext parameter |
| |
| handler must make sure all paths exiting handler routine clear the |
| initstack mutex once not using the initstack itself! |
| |
| */ |
| |
| #define APP instrlist_append |
| |
| /* common routine, kept separate since it is used for both the let_go and alternate let_go paths */
| static void |
| insert_let_go_cleanup(dcontext_t *dcontext, byte *pc, instrlist_t *ilist, |
| instr_t *decision, bool assume_xsp, bool assume_not_on_dstack, |
| after_intercept_action_t action_after) |
| { |
| instr_t *first = NULL; |
| if (action_after == AFTER_INTERCEPT_DYNAMIC_DECISION) { |
| /* placeholder so can find 1st of this path */ |
| first = instrlist_last(ilist); |
| } |
| |
| if (EXIT_DR_HOOK != NULL) { |
| /* make sure to use dr_insert_call() rather than a raw OP_call instr, |
| * since x64 windows requires 32 bytes of stack space even w/ no args. |
| */ |
| IF_DEBUG(bool direct = ) |
| dr_insert_call_ex((void *)dcontext, ilist, NULL/*append*/, |
| /* we're not in vmcode, so avoid indirect call */ |
| pc, (void *)EXIT_DR_HOOK, 0); |
| ASSERT(direct); |
| } |
| |
| /* Get the app xsp passed to the handler from the popa location and store |
| * it in the app xsp cache; this is because the handler could have changed |
| * the app xsp that was passed to it. CAUTION: do this before the popa. |
| */ |
| APP(ilist, INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_XAX), |
| OPND_CREATE_MEMPTR(REG_XSP, |
| offsetof(priv_mcontext_t, xsp)))); |
| APP(ilist, INSTR_CREATE_mov_st(dcontext, |
| OPND_CREATE_MEMPTR(REG_XSP, |
| sizeof(priv_mcontext_t)+XSP_SZ), |
| opnd_create_reg(REG_XAX))); |
| /* now restore everything */ |
| insert_pop_all_registers(dcontext, NULL, ilist, NULL, XSP_SZ/*see push_all use*/); |
| |
| if (action_after == AFTER_INTERCEPT_DYNAMIC_DECISION) { |
| /* now that instrs are there, take 1st */ |
| ASSERT(first != NULL); |
| instr_set_target(decision, opnd_create_instr(instr_get_next(first))); |
| } |
| |
| if (!assume_xsp) { |
| instr_t *restore_initstack = INSTR_CREATE_label(dcontext); |
| instr_t *done_restoring = INSTR_CREATE_label(dcontext); |
| APP(ilist, |
| INSTR_CREATE_mov_st(dcontext, |
| opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL, |
| 0, PID_TIB_OFFSET, OPSZ_PTR), |
| opnd_create_reg(REG_XCX))); |
| APP(ilist, |
| INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_XCX))); |
| |
| /* popa doesn't restore xsp; the handler might have changed it, so |
| * restore it from the app xsp cache, which is now the top of stack. |
| */ |
| APP(ilist, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_XSP))); |
| APP(ilist, INSTR_CREATE_jecxz(dcontext, opnd_create_instr(restore_initstack))); |
| APP(ilist, |
| INSTR_CREATE_jmp(dcontext, opnd_create_instr(done_restoring))); |
| /* use initstack to avoid any assumptions about app xsp */ |
| APP(ilist, restore_initstack); |
| #ifdef X64 |
| APP(ilist, |
| INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_XCX), |
| OPND_CREATE_INTPTR((ptr_uint_t)&initstack_mutex))); |
| #endif |
| APP(ilist, |
| INSTR_CREATE_mov_st(dcontext, |
| IF_X64_ELSE(OPND_CREATE_MEM32(REG_XCX, 0), |
| OPND_CREATE_ABSMEM((void *)&initstack_mutex, |
| OPSZ_4)), |
| OPND_CREATE_INT32(0))); |
| APP(ilist, done_restoring); |
| #ifdef X64 |
| /* we could perhaps assume the top 32 bits of win32_pid are zero, but |
| * xchg works just as well */ |
| APP(ilist, |
| INSTR_CREATE_mov_imm(dcontext, |
| opnd_create_reg(REG_XCX), |
| OPND_CREATE_INTPTR((ptr_uint_t)win32_pid))); |
| APP(ilist, |
| INSTR_CREATE_xchg(dcontext, |
| opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL, |
| 0, PID_TIB_OFFSET, OPSZ_PTR), |
| opnd_create_reg(REG_XCX))); |
| #else |
| APP(ilist, |
| INSTR_CREATE_mov_ld(dcontext, |
| opnd_create_reg(REG_XCX), |
| opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL, |
| 0, PID_TIB_OFFSET, OPSZ_PTR))); |
| APP(ilist, |
| INSTR_CREATE_mov_st(dcontext, |
| opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL, |
| 0, PID_TIB_OFFSET, OPSZ_PTR), |
| OPND_CREATE_INTPTR(win32_pid))); |
| #endif |
| } else { |
| /* popa doesn't restore xsp; the handler might have changed it, so |
| * restore it from the app xsp cache, which is now the top of stack. |
| */ |
| APP(ilist, INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XSP), |
| opnd_create_base_disp(REG_XSP, REG_NULL, 0, |
| XSP_SZ, OPSZ_0))); |
| APP(ilist, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_XSP))); |
| } |
| } |
| |
| /* Emits a landing pad (shown below) and returns the address to the first |
| * instruction in it. Also returns the address where displaced app |
| * instrs should be copied in displaced_app_loc. |
| * |
| * The caller must call finalize_landing_pad_code() once finished copying |
| * the displaced app code, passing in the changed_prot value it received |
| * from this routine. |
| * |
| * CAUTION: These landing pad layouts are assumed in intercept_call() and in |
| * read_and_verify_dr_marker(), must_not_be_elided(), and |
| * is_syscall_trampoline(). |
| *ifndef X64 |
| * 32-bit landing pad: |
| * jmp tgt_pc ; 5 bytes, 32-bit relative jump |
| * displaced app instr(s) ; < (JMP_LONG_LENGTH + MAX_INSTR_LENGTH) bytes |
| * jmp after_hook_pc ; 5 bytes, 32-bit relative jump |
| *else |
| * 64-bit landing pad: |
| * tgt_pc ; 8 bytes of absolute address, i.e., tgt_pc |
| * jmp [tgt_pc] ; 6 bytes, 64-bit absolute indirect jmp |
| * displaced app instr(s) ; < (JMP_LONG_LENGTH + MAX_INSTR_LENGTH) bytes |
| * jmp after_hook_pc ; 5 bytes, 32-bit relative jump |
| *endif |
| * |
| * Note: For 64-bit landing pad, tgt_pc can be stored at the bottom of the |
| * trampoline too. I chose the top because it helps avoid a minor reachability |
| * problem: if the landing pad is allocated at the topmost part of the
| * reachability region for a given addr_to_hook, then there is a possibility |
| * that the return jmp from the landing pad may not reach the instruction after |
| * the hook address. This is because the number of bytes of the hook (5 bytes) |
| * and the number of bytes of the instruction(s) clobbered at the hook point |
| * might be different. If the clobbered bytes are more than 5 bytes, then the |
| * return jmp from the landing pad won't be able to reach it. By placing 8 |
| * bytes above the landing pad, we give it the extra reachability needed. |
| * Also, having the tgt_pc at the top of the landing pad makes it easy to see |
| * the disassembly of the whole landing pad while debugging; otherwise there is
| * a jmp and garbage after it.
| * |
| * This isn't a problem for 32-bit landing pad because in 32-bit everything is |
| * reachable. |
| * |
| * We must put the displaced app instr(s) in the landing pad for x64 |
| * b/c they may contain rip-rel data refs and those may not reach if |
| * in the main trampoline (i#902). |
| * |
| * See heap.c for details about what landing pads are. |
| */ |
| #define JMP_SIZE (IF_X64_ELSE(JMP_ABS_IND64_SIZE, JMP_REL32_SIZE)) |
| static byte * |
| emit_landing_pad_code(byte *lpad_buf, const byte *tgt_pc, |
| const byte *after_hook_pc, |
| size_t displaced_app_size, |
| byte **displaced_app_loc OUT, |
| bool *changed_prot) |
| { |
| byte *lpad_entry = lpad_buf; |
| bool res; |
| byte *lpad_start = lpad_buf; |
| ASSERT(lpad_buf != NULL); |
| |
| res = make_hookable(lpad_buf, LANDING_PAD_SIZE, changed_prot); |
| ASSERT(res); |
| |
| #ifndef X64 |
| *lpad_buf = JMP_REL32_OPCODE; |
| lpad_buf++; |
| *((int *)lpad_buf) = (int)(tgt_pc - lpad_buf - 4); |
| lpad_buf += 4; |
| #else |
| *((byte **)lpad_buf) = (byte *)tgt_pc; /* save tgt_pc for the rip-rel jmp */ |
| lpad_buf += sizeof(tgt_pc); |
| lpad_entry = lpad_buf; /* entry is after the first 8 bytes */ |
| *lpad_buf = JMP_ABS_IND64_OPCODE; |
| lpad_buf++; |
| *lpad_buf = JMP_ABS_MEM_IND64_MODRM; |
| lpad_buf++; |
| /* rip-relative displacement back to the 8-byte slot at the start of the landing pad */
| *((int *)lpad_buf) = -(int)(JMP_ABS_IND64_SIZE + sizeof(tgt_pc)); |
| lpad_buf += 4; |
| #endif |
| |
| /* Leave space for the displaced app code */ |
| ASSERT(displaced_app_size < MAX_HOOK_DISPLACED_LENGTH); |
| ASSERT(displaced_app_loc != NULL); |
| *displaced_app_loc = lpad_buf; |
| lpad_buf += displaced_app_size; |
| |
| /* The return 32-bit relative jump is common to both 32-bit and 64-bit
|  * landing pads.  Make sure that this second jmp is emitted at the expected
|  * offset.
|  */
| ASSERT((size_t)(lpad_buf - lpad_start) == |
| JMP_SIZE IF_X64(+ sizeof(tgt_pc)) + displaced_app_size); |
| *lpad_buf = JMP_REL32_OPCODE; |
| lpad_buf++; |
| *((int *)lpad_buf) = (int)(after_hook_pc - lpad_buf - 4); |
| lpad_buf += 4; |
| |
| /* Even though we have the 8 byte space up front for 64-bit, just make sure |
| * that the return jmp can reach the instruction after the hook. |
| */ |
| ASSERT(REL32_REACHABLE(lpad_buf, after_hook_pc)); |
| |
| /* Make sure that the emitted landing pad fits within LANDING_PAD_SIZE. */
| ASSERT(lpad_buf - lpad_start <= LANDING_PAD_SIZE); |
| |
| /* Return unused space */ |
| trim_landing_pad(lpad_start, lpad_buf - lpad_start); |
| |
| return lpad_entry; |
| } |
| |
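| /* Re-protects the landing pad once the caller has finished copying the
|  * displaced app code; changed_prot must be the value returned by
|  * emit_landing_pad_code().
|  */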
| static void |
| finalize_landing_pad_code(byte *lpad_buf, bool changed_prot) |
| { |
| make_unhookable(lpad_buf, LANDING_PAD_SIZE, changed_prot); |
| } |
| |
| /* Assumes that ilist contains decoded instrs for [start_pc, start_pc+size). |
| * Copies size bytes of the app code at start_pc into buf by encoding |
| * the ilist, re-relativizing rip-relative and ctis as it goes along. |
| * Also converts short ctis into 32-bit-offset ctis. |
| * |
| * hotp_only does not support ctis in the middle of the ilist, only at |
| * the end, nor size changes in the middle of the ilist: to support |
| * that we'd need a relocation table mapping old instruction offsets |
| * to the newly emitted ones. |
| * |
| * As of today only one cti is allowed in a patch region, and only at
| * the end of it, so the starting location of that cti won't change even if we |
| * convert and re-relativize it. This means hot patch control flow changes into |
| * the middle of a patch region won't have to worry about using an offset table. |
| * |
| * The current patch region definition doesn't allow ctis to be in the |
| * middle of patch regions. This means we don't have to worry about |
| * re-relativizing ctis in the middle of a patch region. However Alex has an |
| * argument about allowing cbrs to be completely inside a patch region as |
| * control flow can never reach the following instruction other than fall |
| * through, i.e., not from outside.  This is a matter for debate, but one
| * which would require the ilist and a per-patch-point relocation table.
| */ |
| static byte * |
| copy_app_code(dcontext_t *dcontext, const byte *start_pc, |
| byte *buf, size_t size, instrlist_t *ilist) |
| { |
| instr_t *instr; |
| byte *buf_nxt; |
| DEBUG_DECLARE(byte *buf_start = buf;) |
| DEBUG_DECLARE(bool size_change = false;) |
| ASSERT(dcontext != NULL && start_pc != NULL && buf != NULL); |
| /* Patch region should be at least 5 bytes in length, but no more than 5 |
| * plus the length of the last instruction in the region. |
| */ |
| ASSERT(size >= 5 && size < |
| (size_t)(5 + instr_length(dcontext, instrlist_last(ilist)))); |
| |
| /* We have to walk the instr list to lengthen short (8-bit) ctis */ |
| for (instr = instrlist_first(ilist); instr != NULL; instr = instr_get_next(instr)) { |
| /* For short ctis in the loop to jecxz range, the cti conversion |
| * will set the target in the raw bits, so the raw bits will be valid. |
| * For other short ctis, the conversion will invalidate the raw bits, |
| * so a full encoding is enforced. For other ctis, the raw bits aren't |
| * valid for encoding because we are relocating them; so invalidate |
| * them explicitly. |
| */ |
| if (instr_opcode_valid(instr) && instr_is_cti(instr)) { |
| if (instr_is_cti_short(instr)) { |
| DODEBUG({ size_change = true; }); |
| convert_to_near_rel(dcontext, instr); |
| } else |
| instr_set_raw_bits_valid(instr, false); |
| /* see notes above: hotp_only doesn't support non-final cti */ |
| ASSERT(!instr_is_cti(instr) || instr == instrlist_last(ilist)); |
| } |
| #ifdef X64 |
| /* If we have reachability issues, instrlist_encode() below |
| * will fail. We try to do an assert here for that case |
| * (estimating where the relative offset will be encoded at). |
| * PR 250294's heap pad proposal will solve this. |
| */ |
| DOCHECK(1, { |
| app_pc target; |
| instr_get_rel_addr_target(instr, &target); |
| ASSERT_NOT_IMPLEMENTED |
| ((!instr_has_rel_addr_reference(instr) || |
| REL32_REACHABLE(buf, target)) && |
| "PR 250294: displaced code too far from rip-rel target"); |
| }); |
| #endif |
| } |
| |
| /* now encode and re-relativize x64 rip-relative instructions */ |
| buf_nxt = instrlist_encode(dcontext, ilist, buf, false/*no instr_t targets*/); |
| ASSERT(buf_nxt != NULL); |
| ASSERT((buf_nxt - buf) == (ssize_t)size ||
| (size_change && (buf_nxt - buf) > (ssize_t)size));
| return buf_nxt; |
| } |
| |
| /* N.B.: !assume_xsp && !assume_not_on_dstack implies eflags assumptions! |
| * !assume_xsp && assume_not_on_dstack does not assume eflags. |
| * Could optimize by having a bool indicating whether to have a callee arg or not, |
| * but then the intercept_function_t typedef must be void, or must have two, so we |
| * just make every callee take an arg. |
| * |
| * Currently only hotp_only uses alt_after_tgt_p. It points at the pointer-sized |
| * target that initially has the value alternate_after. It is NOT intra-cache-line |
| * aligned and thus if the caller wants a hot-patchable target it must |
| * have another layer of indirection. |
| */ |
| static byte * |
| emit_intercept_code(dcontext_t *dcontext, byte *pc, intercept_function_t callee, |
| void *callee_arg, bool assume_xsp, bool assume_not_on_dstack, |
| after_intercept_action_t action_after, byte *alternate_after, |
| byte **alt_after_tgt_p OUT) |
| { |
| instrlist_t ilist; |
| instr_t *inst, *push_start, *push_start2 = NULL; |
| instr_t *decision = NULL, *alt_decision = NULL, *alt_after = NULL; |
| uint len; |
| byte *start_pc, *push_pc, *push_pc2 = NULL; |
| app_pc no_cleanup; |
| uint stack_offs = 0; |
| IF_DEBUG(bool direct;) |
| |
| /* AFTER_INTERCEPT_LET_GO_ALT_DYN is used only dynamically to select alternate */ |
| ASSERT(action_after != AFTER_INTERCEPT_LET_GO_ALT_DYN); |
| |
| /* alternate_after provided only when possibly using alternate target */ |
| ASSERT(alternate_after == NULL || |
| action_after == AFTER_INTERCEPT_DYNAMIC_DECISION || |
| action_after == AFTER_INTERCEPT_TAKE_OVER_SINGLE_SHOT); |
| |
| /* initialize the ilist */ |
| instrlist_init(&ilist); |
| |
| if (!assume_xsp) { |
| instr_t *no_local_stack = INSTR_CREATE_label(dcontext); |
| instr_t *grab_initstack = INSTR_CREATE_label(dcontext); |
| instr_t *get_lock = INSTR_CREATE_label(dcontext); |
| instr_t *have_lock = INSTR_CREATE_label(dcontext); |
| instr_t *have_stack_now = INSTR_CREATE_label(dcontext); |
| APP(&ilist, |
| INSTR_CREATE_mov_st(dcontext, |
| opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL, |
| 0, PID_TIB_OFFSET, OPSZ_PTR), |
| opnd_create_reg(REG_XCX))); |
| APP(&ilist, |
| INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_XCX), |
| opnd_create_tls_slot(os_tls_offset(TLS_DCONTEXT_SLOT)))); |
| APP(&ilist, INSTR_CREATE_jecxz(dcontext, opnd_create_instr(no_local_stack))); |
| |
| if (!assume_not_on_dstack) { |
| instr_t *not_on_dstack = INSTR_CREATE_label(dcontext); |
| APP(&ilist, |
| instr_create_restore_from_dc_via_reg(dcontext, REG_XCX, REG_XCX, |
| DSTACK_OFFSET)); |
| APP(&ilist, |
| INSTR_CREATE_cmp(dcontext, opnd_create_reg(REG_XSP), |
| opnd_create_reg(REG_XCX))); |
| APP(&ilist, |
| INSTR_CREATE_jcc(dcontext, OP_jge, opnd_create_instr(not_on_dstack))); |
| APP(&ilist, |
| INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XCX), |
| opnd_create_base_disp(REG_XCX, REG_NULL, 0, |
| -(int)DYNAMORIO_STACK_SIZE, OPSZ_0))); |
| APP(&ilist, |
| INSTR_CREATE_cmp(dcontext, opnd_create_reg(REG_XSP), |
| opnd_create_reg(REG_XCX))); |
| APP(&ilist, |
| INSTR_CREATE_jcc(dcontext, OP_jl, opnd_create_instr(not_on_dstack))); |
| APP(&ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_XSP))); |
| APP(&ilist, |
| INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INT32(2))); |
| APP(&ilist, |
| INSTR_CREATE_jmp(dcontext, opnd_create_instr(have_stack_now))); |
| APP(&ilist, not_on_dstack); |
| APP(&ilist, INSTR_CREATE_mov_ld |
| (dcontext, opnd_create_reg(REG_XCX), |
| opnd_create_tls_slot(os_tls_offset(TLS_DCONTEXT_SLOT)))); |
| } |
| |
| /* Store the app xsp in dcontext and switch to dstack. */ |
| if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) { |
| APP(&ilist, |
| instr_create_restore_from_dc_via_reg(dcontext, REG_XCX, REG_XCX, |
| PROT_OFFS)); |
| } |
| APP(&ilist, |
| instr_create_save_to_dc_via_reg(dcontext, REG_XCX, REG_XSP, XSP_OFFSET)); |
| if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) { |
| APP(&ilist, INSTR_CREATE_mov_ld |
| (dcontext, opnd_create_reg(REG_XCX), |
| opnd_create_tls_slot(os_tls_offset(TLS_DCONTEXT_SLOT)))); |
| } |
| APP(&ilist, |
| instr_create_restore_from_dc_via_reg(dcontext, REG_XCX, REG_XSP, |
| DSTACK_OFFSET)); |
| |
| /* Get the app xsp from the dcontext and put it on the dstack to serve |
| * as the app xsp cache. |
| */ |
| if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) { |
| APP(&ilist, |
| instr_create_restore_from_dc_via_reg(dcontext,REG_XCX, REG_XCX, |
| PROT_OFFS)); |
| } |
| APP(&ilist, |
| instr_create_restore_from_dc_via_reg(dcontext, REG_XCX, REG_XCX, XSP_OFFSET)); |
| APP(&ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_XCX))); |
| APP(&ilist, |
| INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INT32(1))); |
| APP(&ilist, |
| INSTR_CREATE_jmp(dcontext, opnd_create_instr(have_stack_now))); |
| |
| /* use initstack to avoid any assumptions about app xsp */ |
| /* first check if we are already on it */ |
| APP(&ilist, no_local_stack); |
| APP(&ilist, |
| INSTR_CREATE_mov_imm(dcontext, |
| opnd_create_reg(REG_XCX), |
| OPND_CREATE_INTPTR((ptr_int_t)initstack))); |
| APP(&ilist, |
| INSTR_CREATE_cmp(dcontext, opnd_create_reg(REG_XSP), |
| opnd_create_reg(REG_XCX))); |
| APP(&ilist, |
| INSTR_CREATE_jcc(dcontext, OP_jge, opnd_create_instr(grab_initstack))); |
| APP(&ilist, |
| INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XCX), |
| opnd_create_base_disp(REG_XCX, REG_NULL, 0, |
| -(int)DYNAMORIO_STACK_SIZE, OPSZ_0))); |
| APP(&ilist, |
| INSTR_CREATE_cmp(dcontext, opnd_create_reg(REG_XSP), |
| opnd_create_reg(REG_XCX))); |
| APP(&ilist, |
| INSTR_CREATE_jcc(dcontext, OP_jl, opnd_create_instr(grab_initstack))); |
| APP(&ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_XSP))); |
| APP(&ilist, |
| INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INT32(2))); |
| APP(&ilist, |
| INSTR_CREATE_jmp(dcontext, opnd_create_instr(have_stack_now))); |
| APP(&ilist, grab_initstack); |
| APP(&ilist, |
| INSTR_CREATE_mov_imm(dcontext, |
| /* on x64 the upper 32 bits will be zeroed for us */ |
| opnd_create_reg(REG_ECX), OPND_CREATE_INT32(1))); |
| #ifdef X64 |
| APP(&ilist, |
| INSTR_CREATE_mov_st(dcontext, |
| opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL, |
| 0, PEB_TIB_OFFSET, OPSZ_PTR), |
| opnd_create_reg(REG_XAX))); |
| #endif |
| APP(&ilist, get_lock); |
| #ifdef X64 |
| APP(&ilist, |
| INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_XAX), |
| OPND_CREATE_INTPTR((ptr_uint_t)&initstack_mutex))); |
| #endif |
| APP(&ilist, |
| INSTR_CREATE_xchg(dcontext, |
| /* initstack_mutex is 32 bits always */ |
| IF_X64_ELSE(OPND_CREATE_MEM32(REG_XAX, 0), |
| OPND_CREATE_ABSMEM((void *)&initstack_mutex, |
| OPSZ_4)), |
| opnd_create_reg(REG_ECX))); |
| APP(&ilist, |
| INSTR_CREATE_jecxz(dcontext, opnd_create_instr(have_lock))); |
| APP(&ilist, |
| INSTR_CREATE_pause(dcontext)); |
| APP(&ilist, |
| INSTR_CREATE_jmp(dcontext, opnd_create_instr(get_lock))); |
| APP(&ilist, have_lock); |
| APP(&ilist, |
| INSTR_CREATE_mov_st(dcontext, |
| IF_X64_ELSE(opnd_create_reg(REG_XCX), |
| OPND_CREATE_ABSMEM((void *)&initstack_app_xsp, |
| OPSZ_PTR)), |
| opnd_create_reg(REG_XSP))); |
| #ifdef X64 |
| /* we can do a 64-bit absolute address into xax only */ |
| APP(&ilist, |
| INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_XAX), |
| OPND_CREATE_ABSMEM((void *)&initstack, OPSZ_PTR))); |
| APP(&ilist, |
| INSTR_CREATE_xchg(dcontext, opnd_create_reg(REG_XSP), |
| opnd_create_reg(REG_XAX))); |
| #else |
| APP(&ilist, |
| INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_XSP), |
| OPND_CREATE_ABSMEM((void *)&initstack, OPSZ_PTR))); |
| #endif |
| APP(&ilist, |
| INSTR_CREATE_push(dcontext, |
| IF_X64_ELSE(opnd_create_reg(REG_XCX), |
| OPND_CREATE_ABSMEM((void *)&initstack_app_xsp, |
| OPSZ_PTR)))); |
| APP(&ilist, |
| INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INT32(0))); |
| #ifdef X64 |
| APP(&ilist, |
| INSTR_CREATE_mov_imm(dcontext, |
| opnd_create_reg(REG_XAX), |
| OPND_CREATE_INTPTR((ptr_uint_t)peb_ptr))); |
| APP(&ilist, |
| INSTR_CREATE_xchg(dcontext, |
| opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL, |
| 0, PEB_TIB_OFFSET, OPSZ_PTR), |
| opnd_create_reg(REG_XAX))); |
| #endif |
| APP(&ilist, have_stack_now); |
| #ifdef X64 |
| /* we could perhaps assume the top 32 bits of win32_pid are zero, but |
| * xchg works just as well */ |
| APP(&ilist, |
| INSTR_CREATE_mov_imm(dcontext, |
| opnd_create_reg(REG_XCX), |
| OPND_CREATE_INTPTR((ptr_uint_t)win32_pid))); |
| APP(&ilist, |
| INSTR_CREATE_xchg(dcontext, |
| opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL, |
| 0, PID_TIB_OFFSET, OPSZ_PTR), |
| opnd_create_reg(REG_XCX))); |
| #else |
| APP(&ilist, |
| INSTR_CREATE_mov_ld(dcontext, |
| opnd_create_reg(REG_XCX), |
| opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL, |
| 0, PID_TIB_OFFSET, OPSZ_PTR))); |
| APP(&ilist, |
| INSTR_CREATE_mov_st(dcontext, |
| opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL, |
| 0, PID_TIB_OFFSET, OPSZ_PTR), |
| OPND_CREATE_INTPTR(win32_pid))); |
| #endif /* X64 */ |
| } else { /* assume_xsp */ |
| /* Cache app xsp so that the right value can be passed to the handler |
| * and to restore at exit. Push stack type too: 3 for app stack. |
| */ |
| APP(&ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_XSP))); |
| APP(&ilist, INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INT32(3))); |
| } |
| |
| /* We assume that if !assume_xsp we've done two pushes on the stack.
|  * DR often only cares about stack alignment for xmm saves.
|  * However, it sometimes calls ntdll routines; and for client exception
|  * handlers that might call random library routines we really care.
|  * We assume that the kernel ensures the stack is aligned on entry, so we
|  * use stack_offs to preserve that alignment in the instrumentation.
|  */
| stack_offs = insert_push_all_registers |
| (dcontext, NULL, &ilist, NULL, XSP_SZ, |
| /* pc slot not used: could use instead of state->start_pc */ |
| /* sign-extended */ |
| INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INT32(0))); |
| |
| /* clear eflags for callee's usage */ |
| APP(&ilist, |
| INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INT32(0))); |
| APP(&ilist, INSTR_CREATE_RAW_popf(dcontext)); |
| |
| /* Get the cached app xsp and update the pusha's xsp with it; this is the |
| * right app xsp. |
| */ |
| APP(&ilist, |
| INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_XAX), |
| OPND_CREATE_MEMPTR(REG_XSP, /* mcxt + stack type */ |
| sizeof(priv_mcontext_t)+XSP_SZ))); |
| APP(&ilist, |
| INSTR_CREATE_mov_st(dcontext, |
| OPND_CREATE_MEMPTR(REG_XSP, |
| offsetof(priv_mcontext_t, xsp)), |
| opnd_create_reg(REG_XAX))); |
| |
| /* FIXME: don't want hooks for trampolines that run natively like |
| * LdrLoadDll or image entry, right? |
| */ |
| if (ENTER_DR_HOOK != NULL) { |
| /* make sure to use dr_insert_call() rather than a raw OP_call instr, |
| * since x64 windows requires 32 bytes of stack space even w/ no args. |
| */ |
| IF_DEBUG(direct = ) |
| dr_insert_call_ex((void *)dcontext, &ilist, NULL/*append*/, |
| /* we're not in vmcode, so avoid indirect call */ |
| pc, (void *)ENTER_DR_HOOK, 0); |
| ASSERT(direct); |
| } |
| |
| /* these are part of app_state_at_intercept_t struct so we have to |
| * push them on the stack, rather than pass in registers |
| */ |
| /* will fill in immed with no_cleanup pointer later */ |
| #ifdef X64 |
| push_start = INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_XAX), |
| OPND_CREATE_INTPTR(0)); |
| APP(&ilist, push_start); |
| APP(&ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_XAX))); |
| APP(&ilist, INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_XAX), |
| OPND_CREATE_INTPTR(callee_arg))); |
| APP(&ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_XAX))); |
| #else |
| push_start = INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INTPTR(0)); |
| APP(&ilist, push_start); |
| APP(&ilist, INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INTPTR(callee_arg))); |
| #endif |
| stack_offs += 2*XSP_SZ; |
| |
| /* We pass xsp as a pointer to all the values on the stack; this is the actual |
| * argument to the intercept routine. Fix for case 7597. |
| * -- CAUTION -- if app_state_at_intercept_t changes in any way, this can
| * blow up!  That structure's field types, order & layout are assumed
| * here.  The two should change only in sync.
| */ |
| if (parameters_stack_padded()) { |
| /* xsp won't have proper value due to stack padding */ |
| APP(&ilist, INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_XAX), |
| opnd_create_reg(REG_XSP))); |
| #ifdef X64 |
| /* i#331: align the misaligned stack */ |
| # define STACK_ALIGNMENT 16 |
| if (!ALIGNED(stack_offs, STACK_ALIGNMENT)) { |
| ASSERT(ALIGNED(stack_offs, XSP_SZ)); |
| APP(&ilist, INSTR_CREATE_lea |
| (dcontext, opnd_create_reg(REG_XSP), |
| opnd_create_base_disp(REG_XSP, REG_NULL, 0, |
| -(int)XSP_SZ, OPSZ_0))); |
| } |
| #endif |
| } |
| IF_DEBUG(direct = ) |
| dr_insert_call_ex(dcontext, &ilist, NULL, |
| /* we're not in vmcode, so avoid indirect call */ |
| pc, (byte *)callee, 1, |
| parameters_stack_padded() ? opnd_create_reg(REG_XAX) : |
| opnd_create_reg(REG_XSP)); |
| ASSERT(direct); |
| #ifdef X64 |
| /* i#331, misaligned stack adjustment cleanup */ |
| if (parameters_stack_padded()) { |
| if (!ALIGNED(stack_offs, STACK_ALIGNMENT)) { |
| ASSERT(ALIGNED(stack_offs, XSP_SZ)); |
| APP(&ilist, INSTR_CREATE_lea |
| (dcontext, opnd_create_reg(REG_XSP), |
| opnd_create_base_disp(REG_XSP, REG_NULL, 0, |
| XSP_SZ, OPSZ_0))); |
| } |
| } |
| #endif |
| /* clean up 2 pushes */ |
| APP(&ilist, INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XSP), |
| opnd_create_base_disp(REG_XSP, REG_NULL, 0, |
| 2*XSP_SZ, OPSZ_0))); |
| if (action_after == AFTER_INTERCEPT_DYNAMIC_DECISION) { |
| /* our 32-bit immed will be sign-extended. |
| * perhaps we could assume upper bits not set and use eax to save a rex.w. |
| */ |
| APP(&ilist, INSTR_CREATE_cmp(dcontext, opnd_create_reg(REG_XAX), |
| OPND_CREATE_INT32(AFTER_INTERCEPT_LET_GO))); |
| /* will fill in later */ |
| decision = INSTR_CREATE_jcc(dcontext, OP_je, opnd_create_instr(NULL)); |
| APP(&ilist, decision); |
| if (alternate_after != NULL) { |
| APP(&ilist, INSTR_CREATE_cmp |
| (dcontext, opnd_create_reg(REG_XAX), |
| OPND_CREATE_INT32(AFTER_INTERCEPT_LET_GO_ALT_DYN))); /*sign-extended*/ |
| /* will fill in later */ |
| alt_decision = INSTR_CREATE_jcc(dcontext, OP_je, opnd_create_instr(NULL)); |
| APP(&ilist, alt_decision); |
| } |
| } |
| |
| if (action_after == AFTER_INTERCEPT_TAKE_OVER || |
| action_after == AFTER_INTERCEPT_TAKE_OVER_SINGLE_SHOT || |
| action_after == AFTER_INTERCEPT_DYNAMIC_DECISION) { |
| /* will fill in immed with no_cleanup pointer later */ |
| #ifdef X64 |
| push_start2 = INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_XAX), |
| OPND_CREATE_INTPTR(0)); |
| APP(&ilist, push_start2); |
| APP(&ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_XAX))); |
| #else |
| push_start2 = INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INTPTR(0)); |
| APP(&ilist, push_start2); |
| #endif |
| APP(&ilist, INSTR_CREATE_push_imm(dcontext, |
| OPND_CREATE_INT32(0/*don't save dcontext*/))); |
| if (parameters_stack_padded()) { |
| /* xsp won't have proper value due to stack padding */ |
| APP(&ilist, INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_XAX), |
| opnd_create_reg(REG_XSP))); |
| #ifdef X64 |
| /* i#331: align the misaligned stack */ |
| APP(&ilist, INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XSP), |
| opnd_create_base_disp(REG_XSP, REG_NULL, 0, |
| -(int)XSP_SZ, OPSZ_0))); |
| #endif |
| } |
| IF_DEBUG(direct = ) |
| dr_insert_call_ex(dcontext, &ilist, NULL, |
| /* we're not in vmcode, so avoid indirect call */ |
| pc, (app_pc)asynch_take_over, 1, |
| parameters_stack_padded() ? opnd_create_reg(REG_XAX) : |
| opnd_create_reg(REG_XSP)); |
| ASSERT(direct); |
| #ifdef INTERNAL |
| IF_DEBUG(direct = ) |
| dr_insert_call_ex(dcontext, &ilist, NULL, |
| /* we're not in vmcode, so avoid indirect call */ |
| pc, (app_pc)internal_error, 3, |
| OPND_CREATE_INTPTR(0), |
| OPND_CREATE_INT32(-3), |
| OPND_CREATE_INTPTR(0)); |
| ASSERT(direct); |
| #endif |
| #ifdef X64 |
| if (parameters_stack_padded()) { |
| /* i#331: misaligned stack adjust cleanup*/ |
| APP(&ilist, INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XSP), |
| opnd_create_base_disp(REG_XSP, REG_NULL, 0, |
| XSP_SZ, OPSZ_0))); |
| } |
| #endif |
| } |
| |
| if (action_after == AFTER_INTERCEPT_LET_GO || |
| action_after == AFTER_INTERCEPT_DYNAMIC_DECISION) { |
| if (alternate_after != NULL) { |
| byte *encode_pc; |
| insert_let_go_cleanup(dcontext, pc, &ilist, alt_decision, |
| assume_xsp, assume_not_on_dstack, action_after); |
| /* alternate after cleanup target */ |
| /* if alt_after_tgt_p != NULL we always do pointer-sized even if |
| * the initial target happens to reach |
| */ |
| /* we assert below we're < PAGE_SIZE for reachability test */ |
| encode_pc = (alt_after_tgt_p != NULL) ? vmcode_unreachable_pc() : pc; |
| IF_DEBUG(direct = ) |
| insert_reachable_cti(dcontext, &ilist, NULL, encode_pc, |
| alternate_after, true/*jmp*/, false/*!precise*/, |
| DR_REG_NULL/*no scratch*/, &alt_after); |
| ASSERT(alt_after_tgt_p == NULL || !direct); |
| } |
| /* the normal let_go target */ |
| insert_let_go_cleanup(dcontext, pc, &ilist, decision, |
| assume_xsp, assume_not_on_dstack, action_after); |
| } |
| |
| /* now encode the instructions */ |
| /* must set note fields first with offset */ |
| len = 0; |
| push_pc = NULL; |
| for (inst = instrlist_first(&ilist); inst; inst = instr_get_next(inst)) { |
| instr_set_note(inst, (void *)(ptr_int_t)len); |
| len += instr_length(dcontext, inst); |
| } |
| start_pc = pc; |
| for (inst = instrlist_first(&ilist); inst; inst = instr_get_next(inst)) { |
| pc = instr_encode(dcontext, inst, pc); |
| ASSERT(pc != NULL); |
| if (inst == push_start) |
| push_pc = (pc - sizeof(ptr_uint_t)); |
| if (inst == push_start2) |
| push_pc2 = (pc - sizeof(ptr_uint_t)); |
| if (inst == alt_after && alt_after_tgt_p != NULL) |
| *alt_after_tgt_p = pc - sizeof(alternate_after); |
| } |
| |
| /* now can point start_pc arg of callee at beyond-cleanup pc */ |
| if (action_after == AFTER_INTERCEPT_TAKE_OVER_SINGLE_SHOT) { |
| /* Note the interface here allows any target. Yet as the name |
| * suggests it should mainly be used to directly transfer to |
| * the now restored trampoline target. |
| */ |
| ASSERT(alternate_after != NULL); |
| no_cleanup = alternate_after; |
| } else { |
| /* callers are supposed to append the original target prefix */ |
| no_cleanup = pc; |
| } |
| |
| ASSERT(push_pc != NULL); |
| *((ptr_uint_t*)push_pc) = (ptr_uint_t)no_cleanup; |
| if (push_pc2 != NULL) |
| *((ptr_uint_t*)push_pc2) = (ptr_uint_t)no_cleanup; |
| |
| ASSERT(pc - start_pc < PAGE_SIZE && "adjust REL32_REACHABLE for alternate_after"); |
| |
| /* free the instrlist_t elements */ |
| instrlist_clear(dcontext, &ilist); |
| |
| return pc; |
| } |
| #undef APP |
| |
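| /* Records that displace_length bytes (including the jmp back) at
|  * interception_pc correspond to orig_length bytes of original app code at
|  * original_app_pc; consumed by get_app_pc_from_intercept_pc() and
|  * is_intercepted_app_pc().
|  */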
| static void |
| map_intercept_pc_to_app_pc(byte *interception_pc, app_pc original_app_pc, |
| size_t displace_length, size_t orig_length) |
| { |
| intercept_map_elem_t *elem = HEAP_TYPE_ALLOC |
| (GLOBAL_DCONTEXT, intercept_map_elem_t, ACCT_OTHER, UNPROTECTED); |
| |
| elem->interception_pc = interception_pc; |
| elem->original_app_pc = original_app_pc; |
| elem->displace_length = displace_length; |
| elem->orig_length = orig_length; |
| elem->next = NULL; |
| |
| mutex_lock(&map_intercept_pc_lock); |
| |
| if (intercept_map->head == NULL) { |
| intercept_map->head = elem; |
| intercept_map->tail = elem; |
| } |
| else { |
| intercept_map->tail->next = elem; |
| intercept_map->tail = elem; |
| } |
| |
| mutex_unlock(&map_intercept_pc_lock); |
| } |
| |
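| /* Removes every intercept map entry whose original app pc matches
|  * original_app_pc (there can be more than one entry per hook; see below).
|  */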
| static void |
| unmap_intercept_pc(app_pc original_app_pc) |
| { |
| intercept_map_elem_t *curr, *prev, *next; |
| |
| mutex_lock(&map_intercept_pc_lock); |
| |
| prev = NULL; |
| curr = intercept_map->head; |
| while (curr != NULL) { |
| next = curr->next; |
| if (curr->original_app_pc == original_app_pc) { |
| if (prev != NULL) { |
| prev->next = curr->next; |
| } |
| if (curr == intercept_map->head) { |
| intercept_map->head = curr->next; |
| } |
| if (curr == intercept_map->tail) { |
| intercept_map->tail = prev; |
| } |
| |
| HEAP_TYPE_FREE(GLOBAL_DCONTEXT, curr, intercept_map_elem_t, |
| ACCT_OTHER, UNPROTECTED); |
| /* We don't break b/c we allow multiple entries and in fact |
| * we have multiple today: one for displaced app code and |
| * one for the jmp from interception buffer to landing pad. |
| */ |
| } else |
| prev = curr; |
| curr = next; |
| } |
| |
| mutex_unlock(&map_intercept_pc_lock); |
| } |
| |
| static void |
| free_intercept_list(void) |
| { |
| /* For all regular hooks, un_intercept_call() calls unmap_intercept_pc() |
| * and removes the hook's entry. But syscall wrappers have a target app |
| * pc that's unusual. Rather than store it for each, we just tear |
| * down the whole list. |
| */ |
| intercept_map_elem_t *curr; |
| mutex_lock(&map_intercept_pc_lock); |
| while (intercept_map->head != NULL) { |
| curr = intercept_map->head; |
| intercept_map->head = curr->next; |
| HEAP_TYPE_FREE(GLOBAL_DCONTEXT, curr, intercept_map_elem_t, |
| ACCT_OTHER, UNPROTECTED); |
| } |
| intercept_map->head = NULL; |
| intercept_map->tail = NULL; |
| mutex_unlock(&map_intercept_pc_lock); |
| } |
| |
| /* We assume no mangling of code placed in the interception buffer, |
| * other than re-relativizing ctis. As such, we can uniquely correlate |
| * interception buffer PCs to their original app PCs. |
| * Caller must check that pc is actually in the intercept buffer (or landing |
| * pad displaced app code or jmp back). |
| */ |
| app_pc |
| get_app_pc_from_intercept_pc(byte *pc) |
| { |
| intercept_map_elem_t *iter = intercept_map->head; |
| while (iter != NULL) { |
| byte *start = iter->interception_pc; |
| byte *end = start + iter->displace_length; |
| if (pc >= start && pc < end) { |
| /* include jmp back but map it to instr after displacement */ |
| if ((size_t)(pc - start) > iter->orig_length) |
| return iter->original_app_pc + iter->orig_length; |
| else |
| return iter->original_app_pc + (pc - start); |
| } |
| |
| iter = iter->next; |
| } |
| |
| ASSERT_NOT_REACHED(); |
| return NULL; |
| } |
| |
| bool |
| is_intercepted_app_pc(app_pc pc, byte **interception_pc) |
| { |
| intercept_map_elem_t *iter = intercept_map->head; |
| while (iter != NULL) { |
| /* i#268: respond for any pc not just the first. |
| * FIXME: do we handle app targeting middle of hook? |
| * I'm assuming here that we would not create another |
| * entry for that start and it's ok to not match only start. |
| */ |
| if (pc >= iter->original_app_pc && |
| pc < iter->original_app_pc + iter->orig_length) { |
| /* PR 219351: For syscall trampolines, while building bbs we replace |
| * the jmp and never execute from the displaced app code in the |
| * buffer, so the bb looks normal. FIXME: should we just not add to |
| * the map? For now, better safe than sorry so |
| * get_app_pc_from_intercept_pc will work in case we ever ask about |
| * that displaced app code. |
| */ |
| if (is_syscall_trampoline(iter->interception_pc, NULL)) |
| return false; |
| if (interception_pc != NULL) |
| *interception_pc = iter->interception_pc + (pc - iter->original_app_pc); |
| |
| return true; |
| } |
| |
| iter = iter->next; |
| } |
| |
| return false; |
| } |
| |
| /* Emits a jmp at pc to resume_pc. If pc is in the interception buffer, |
| * adds a map entry from [xl8_start_pc, return value here) to |
| * [app_pc, <same size>). |
| */ |
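| /* On x86 this emits a 5-byte "jmp rel32"; on x64 it emits a 6-byte |
| * "jmp qword ptr [rip+0]" followed by the 8-byte absolute resume_pc. |
| */ |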
| static byte * |
| emit_resume_jmp(byte *pc, byte *resume_pc, byte *app_pc, byte *xl8_start_pc) |
| { |
| #ifndef X64 |
| *pc = JMP_REL32_OPCODE; pc++; |
| *((int *)pc) = (int)(resume_pc - pc - 4); |
| pc += 4; /* 4 is the size of the relative offset */ |
| #else |
| *pc = JMP_ABS_IND64_OPCODE; pc++; |
| *pc = JMP_ABS_MEM_IND64_MODRM; pc++; |
| #endif |
| /* We explicitly map rather than having instr_set_translation() and |
| * dr_fragment_app_pc() special-case this jump: longer linear search |
| * in the interception map, but cleaner code. |
| */ |
| if (is_in_interception_buffer(pc) && app_pc != NULL) { |
| ASSERT(xl8_start_pc != NULL); |
| map_intercept_pc_to_app_pc(xl8_start_pc, app_pc, pc - xl8_start_pc, |
| pc - xl8_start_pc); |
| } |
| #ifdef X64 |
| /* 64-bit abs address is placed after the jmp instr., i.e., rip rel is 0. |
| * We can't place it before the jmp as in the case of the landing pad |
| * because there is code in the trampoline immediately preceding this jmp. |
| */ |
| *((int *)pc) = 0; pc += 4; /* zero rip-rel offset: the 8-byte abs target follows the jmp */ |
| *((byte **)pc) = resume_pc; pc += sizeof(resume_pc); |
| #endif |
| return pc; |
| } |
| |
| /* Redirects code at tgt_pc to jmp to our_pc, which is filled with generated |
| * code to call prof_func and then return to the original code. |
| * Assumes that the original tgt_pc should be unwritable. |
| * The caller is responsible for adding the generated |
| * code at our_pc to the dynamo/executable list(s). |
| * |
| * We assume we're being called either before any threads are created |
| * or while all threads are suspended, as our code-overwriting is not atomic! |
| * The only fix is to switch from code-overwriting to import-table modification, |
| * which is more complicated (see Richter, chapter 22, for an example); import-table |
| * modification also does not allow arbitrary hook placement, which we need |
| * for probes and hot patches. |
| * |
| * We guarantee to use a 5-byte jump instruction, even on x64 (PR 250294: we |
| * sometimes have to allocate nearby landing pads there. See PR 245169 for all |
| * of the possibilities for x64 hooking, all of which are either very large or |
| * have caveats; we decided that allocating several 64K chunks and sticking w/ |
| * 5-byte jumps was the cleanest). It is up to the caller to ensure that we |
| * aren't crossing a cti target point and that displacing these 5 bytes is safe |
| * (though we will take care of re-relativizing the displaced code). |
| * |
| * When cti_safe_to_ignore is true, we expect the code to be restored |
| * immediately after our trampoline is hit, so we can treat the |
| * first 5 bytes as raw bytes. Otherwise, we may need to PC-relativize or |
| * deal with conflicting hookers (case 2525). Since a CTI in the |
| * target is a good sign of a prior hooker, we may decide to treat that |
| * specially based on DYNAMO_OPTION(hook_conflict), or we can give up |
| * and not intercept this call when abort_on_incompatible_hooker is |
| * true. |
| * FIXME: if we add one more flag we should switch to a single flag enum |
| * |
| * Currently only hotp_only uses app_code_copy_p and alt_exit_tgt_p. |
| * These point at their respective locations. alt_exit_tgt_p is |
| * currently NOT aligned for hot patching. |
| * |
| * Returns pc after last instruction of emitted interception code, |
| * or NULL when abort_on_incompatible_hooker is true and tgt_pc starts with a CTI. |
| */ |
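| /* Rough layout of what intercept_call() emits (the details live in |
| * emit_landing_pad_code(), emit_intercept_code(), and emit_resume_jmp()): |
| * trampoline @our_pc: [5-byte copy of tgt_pc bytes (never executed)] |
| * [landing pad address] [intercept code] |
| * [jmp to the displaced app code in the landing pad] |
| * landing pad: [jmp to intercept code] [displaced app code] |
| * [jmp back to tgt_pc + size] |
| * app @tgt_pc: [5-byte jmp rel32 to the landing pad] |
| */ |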
| static byte * |
| intercept_call(byte *our_pc, byte *tgt_pc, intercept_function_t prof_func, |
| void *callee_arg, bool assume_xsp, after_intercept_action_t action_after, |
| bool abort_on_incompatible_hooker, |
| bool cti_safe_to_ignore, |
| byte **app_code_copy_p, |
| byte **alt_exit_tgt_p) |
| { |
| byte *pc, *our_pc_end, *lpad_start, *lpad_pc, *displaced_app_pc; |
| size_t size; |
| instrlist_t ilist; |
| instr_t *instr; |
| bool changed_prot; |
| dcontext_t *dcontext = get_thread_private_dcontext(); |
| bool is_hooked = false; |
| bool ok; |
| |
| if (dcontext == NULL) |
| dcontext = GLOBAL_DCONTEXT; |
| |
| ASSERT(tgt_pc != NULL); |
| /* can't detect hookers if ignoring CTIs */ |
| ASSERT(!abort_on_incompatible_hooker || !cti_safe_to_ignore); |
| |
| /* we need 5 bytes for a jump |
| * find instr boundary >= 5 bytes after pc |
| */ |
| LOG(GLOBAL, LOG_ASYNCH, 3, "before intercepting:\n"); |
| instrlist_init(&ilist); |
| pc = tgt_pc; |
| do { |
| app_pc next_pc; |
| DOLOG(3, LOG_ASYNCH, { |
| disassemble_with_bytes(dcontext, pc, main_logfile); |
| }); |
| instr = instr_create(dcontext); |
| next_pc = decode_cti(dcontext, pc, instr); |
| ASSERT(instr_valid(instr)); |
| instrlist_append(&ilist, instr); |
| |
| /* we do not handle control transfer instructions very well here! (case 2525) */ |
| if (instr_opcode_valid(instr) && instr_is_cti(instr)) { |
| /* allow for only a single cti at first instruction, |
| * |
| * unless CTIs are safe to ignore since never actually |
| * re-relativized (case 4086 == once-only so don't execute copy) |
| */ |
| ASSERT(!is_hooked); |
| ASSERT(tgt_pc == pc || cti_safe_to_ignore); |
| if (!cti_safe_to_ignore) { |
| /* we treat this as a sign of a third party hooking before us */ |
| is_hooked = true; |
| } |
| } |
| |
| pc = next_pc; |
| |
| /* some of our trampolines are best effort anyways: LdrLoadDll |
| * shouldn't matter much, yet we like to keep it when we can |
| */ |
| if (is_hooked && abort_on_incompatible_hooker) { |
| SYSLOG_INTERNAL_WARNING_ONCE("giving up interception: "PFX" already hooked\n", |
| tgt_pc); |
| LOG(GLOBAL, LOG_ASYNCH, 1, "intercept_call: giving up "PFX" already hooked\n", tgt_pc); |
| instrlist_clear(dcontext, &ilist); |
| return NULL; |
| } |
| |
| if (pc == NULL || (is_hooked && DYNAMO_OPTION(hook_conflict) == HOOKED_TRAMPOLINE_DIE)) { |
| FATAL_USAGE_ERROR(TAMPERED_NTDLL, 2, get_application_name(), get_application_pid()); |
| } |
| |
| size = (pc - tgt_pc); |
| } while (size < 5); |
| |
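| /* At this point ilist holds the decoded instrs covering the first size (>= 5) |
| * bytes at tgt_pc; copy_app_code() uses it below to re-relativize the displaced copy. |
| */ |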
| pc = our_pc; |
| |
| if (is_hooked && DYNAMO_OPTION(hook_conflict) == HOOKED_TRAMPOLINE_SQUASH) { |
| /* squash over original with expected code, so that both |
| * copies we make later (one for actual execution and one for |
| * uninterception) have the supposedly original values; see the |
| * use in intercept_syscall_wrapper(). |
| */ |
| /* FIXME: it is not easy to get the correct original bytes |
| * probably best solution is to read from the original |
| * ntdll.dll on disk. To avoid having to deal with RVA disk |
| * to virtual address transformations, it may be even easier |
| * to call LdrLoadDll with a different path to load a |
| * pristine copy, e.g. \\?C:\WINNT\system32\ntdll.dll |
| */ |
| /* FIXME: even if we detach we don't restore the original |
| * values, since what we have here should be good enough |
| */ |
| ASSERT_NOT_IMPLEMENTED(false); |
| } |
| |
| /* Store 1st 5 bytes of original code at start of our code |
| * (won't be executed, original code will jump to after it) |
| * We do this for convenience of un-intercepting, so we don't have to |
| * record offset of the copy in the middle of the interception code |
| * CAUTION: storing the exact copy of the 5 bytes from the app image at |
| * the start of the trampoline is assumed in hotp_only for |
| * case 7279 - change only in synch. |
| */ |
| memcpy(pc, tgt_pc, 5); |
| pc += 5; |
| |
| /* Allocate the landing pad, store its address (4 bytes in 32-bit builds |
| * and 8 in 64-bit ones) in the trampoline, just after the original app |
| * code, and emit it. |
| */ |
| lpad_start = alloc_landing_pad(tgt_pc); |
| memcpy(pc, &lpad_start, sizeof(lpad_start)); |
| pc += sizeof(lpad_start); |
| |
| if (alt_exit_tgt_p != NULL) { |
| /* XXX: if we wanted to align for hot-patching we'd do so here |
| * and we'd pass the (post-padding) pc here as the alternate_after |
| * to emit_intercept_code |
| */ |
| } |
| |
| lpad_pc = lpad_start; |
| lpad_pc = emit_landing_pad_code(lpad_pc, pc, tgt_pc + size, |
| size, &displaced_app_pc, &changed_prot); |
| |
| pc = emit_intercept_code(dcontext, pc, prof_func, callee_arg, |
| assume_xsp, assume_xsp, action_after, |
| (action_after == |
| AFTER_INTERCEPT_TAKE_OVER_SINGLE_SHOT) ? |
| tgt_pc : |
| ((alt_exit_tgt_p != NULL) ? |
| CURRENTLY_UNKNOWN : |
| NULL), |
| alt_exit_tgt_p); |
| |
| /* If we are TAKE_OVER_SINGLE_SHOT then the handler routine has promised to |
| * restore the original code and supply the appropriate continuation address. |
| * As such there is no need for us to copy the code here as we will never use it. |
| * (Note not copying the code also gives us a quick fix for the Vista image entry |
| * problem in PR 293452 from not yet handling non-reaching ctis in hook displaced |
| * code PR 268988). FIXME - not having a displaced copy to decode breaks the |
| * redirection decode_as_bb() (but not other decode routines) uses to hide the |
| * hook from the client (see PR 293465 for other reasons we need a better solution |
| * to that problem). */ |
| if (action_after != AFTER_INTERCEPT_TAKE_OVER_SINGLE_SHOT) { |
| /* Map displaced code to original app PCs */ |
| map_intercept_pc_to_app_pc |
| (displaced_app_pc, tgt_pc, size + JMP_LONG_LENGTH /* include jmp back */, |
| size); |
| |
| /* Copy original instructions to our version, re-relativizing where necessary */ |
| if (app_code_copy_p != NULL) |
| *app_code_copy_p = displaced_app_pc; |
| copy_app_code(dcontext, tgt_pc, displaced_app_pc, size, &ilist); |
| } else { |
| /* single shot hooks shouldn't need a copy of the app code */ |
| ASSERT(app_code_copy_p == NULL); |
| } |
| |
| finalize_landing_pad_code(lpad_start, changed_prot); |
| |
| /* free the instrlist_t elements */ |
| instrlist_clear(dcontext, &ilist); |
| |
| if (is_hooked) { |
| if (DYNAMO_OPTION(hook_conflict) == HOOKED_TRAMPOLINE_CHAIN) { |
| /* we only have to rerelativize rel32, yet indirect |
| * branches can also be used by hookers, in which case we |
| * don't need to do anything special when copying as bytes |
| */ |
| |
| /* FIXME: now re-relativize at target location */ |
| ASSERT_NOT_IMPLEMENTED(false); |
| ASSERT_NOT_TESTED(); |
| } |
| } |
| |
| /* Must return to the displaced app code in the landing pad */ |
| pc = emit_resume_jmp(pc, displaced_app_pc, tgt_pc, pc); |
| our_pc_end = pc; |
| |
| /* Replace original code with jmp to our version (after 5-byte backup) */ |
| /* copy-on-write will give us a copy of this page */ |
| ok = make_hookable(tgt_pc, JMP_REL32_SIZE, &changed_prot); |
| if (!ok) { |
| /* FIXME: we fail to insert our hook but for now it is easier |
| * to pretend that we succeeded. */ |
| /* should really return NULL and have callers handle this better */ |
| return our_pc_end; |
| } |
| pc = tgt_pc; |
| *pc = JMP_REL32_OPCODE; pc++; |
| IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_int(lpad_pc - pc - 4))); |
| *((int *)pc) = (int)(ptr_int_t)(lpad_pc - pc - 4); |
| /* make our page unwritable now */ |
| make_unhookable(tgt_pc, JMP_REL32_SIZE, changed_prot); |
| |
| ASSERT(our_pc_end != NULL); |
| return our_pc_end; |
| } |
| |
| /* Assumes that tgt_pc should be unwritable. Handles hooks with or without |
| * a landing pad. our_pc points to the saved copy of the original app bytes |
| * that is restored to tgt_pc. |
| */ |
| static void |
| un_intercept_call(byte *our_pc, byte *tgt_pc) |
| { |
| bool changed_prot; |
| bool ok; |
| byte *lpad_entry; |
| /* if intercept_call() has failed we shouldn't be un-intercepting */ |
| if (our_pc == NULL) |
| return; |
| |
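| /* The hook at tgt_pc is a "jmp rel32" to the landing pad entry, so recover |
| * the entry from the rel32 operand of the jmp that intercept_call() wrote. |
| */ |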
| lpad_entry = (tgt_pc + JMP_REL32_SIZE) + *((int *)(tgt_pc + 1)); |
| |
| /* restore 1st 5 bytes of original code */ |
| ok = make_hookable(tgt_pc, JMP_REL32_SIZE, &changed_prot); |
| /* if we were able to hook we can't fail on unhook */ |
| ASSERT(ok || memcmp(tgt_pc, our_pc, JMP_REL32_SIZE) == 0 /* hook wasn't applied */); |
| if (!ok) { |
| return; |
| } |
| ASSERT(memcmp(tgt_pc, our_pc, JMP_REL32_SIZE) != 0 /* hook was applied */); |
| memcpy(tgt_pc, our_pc, JMP_REL32_SIZE); |
| make_unhookable(tgt_pc, JMP_REL32_SIZE, changed_prot); |
| |
| /* Redirect the first jump in the landing pad to the hooked address (which we just |
| * restored above) - in case someone has chained with our hook. |
| */ |
| ok = make_hookable(lpad_entry, JMP_SIZE, &changed_prot); |
| ASSERT(ok); |
| if (ok) { |
| /* patch jmp to go back to target */ |
| /* Note - not a hot_patch, caller must have synchronized already to make the |
| * memcpy restore above safe. */ |
| /* FIXME: this looks wrong for x64 which uses abs jmp */ |
| insert_relative_target(lpad_entry+1, tgt_pc, false /* not a hotpatch */); |
| make_unhookable(lpad_entry, JMP_SIZE, changed_prot); |
| } |
| |
| DOLOG(3, LOG_ASYNCH, { |
| byte *pc = tgt_pc; |
| LOG(GLOBAL, LOG_ASYNCH, 3, "after un-intercepting:\n"); |
| do { |
| /* Use GLOBAL_DCONTEXT here since we may have already |
| * called dynamo_thread_exit() |
| */ |
| pc = disassemble_with_bytes(GLOBAL_DCONTEXT, pc, main_logfile); |
| } while (pc < tgt_pc + JMP_REL32_SIZE); |
| }); |
| |
| unmap_intercept_pc((app_pc)tgt_pc); |
| } |
| |
| /* Returns the syscall wrapper at nt_wrapper to a pristine (unhooked) state. Currently |
| * used for -clean_testalert to block the problematic injection of SpywareDoctor (9288) |
| * and similar apps. Returns true if syscall wrapper required cleaning */ |
| /* FIXME - use this for our hook conflict squash policy in intercept_syscall_wrapper as |
| * this can handle more complicated hooks. */ |
| static bool |
| clean_syscall_wrapper(byte *nt_wrapper, int sys_enum) |
| { |
| dcontext_t *dcontext = GLOBAL_DCONTEXT; |
| instr_t *instr_new, *instr_old = instr_create(dcontext); |
| instrlist_t *ilist = instrlist_create(dcontext); |
| app_pc pc = nt_wrapper; |
| bool hooked = false; |
| int sysnum = syscalls[sys_enum]; |
| uint arg_bytes = syscall_argsz[sys_enum]; |
| |
| if (nt_wrapper == NULL || sysnum == SYSCALL_NOT_PRESENT) |
| goto exit_clean_syscall_wrapper; |
| |
| /* syscall wrapper should look like |
| * For NT/2000 |
| * mov eax, sysnum {5 bytes} |
| * lea edx, [esp+4] {4 bytes} |
| * int 2e {2 bytes} |
| * ret arg_bytes {1 byte (0 args) or 3 bytes} |
| * |
| * For XPsp0/XPsp1/2003sp0 |
| * mov eax, sysnum {5 bytes} |
| * mov edx, VSYSCALL_ADDR {5 bytes} |
| * call edx {2 bytes} |
| * ret arg_bytes {1 byte (0 args) or 3 bytes} |
| * |
| * For XPsp2/2003sp1/Vista |
| * mov eax, sysnum {5 bytes} |
| * mov edx, VSYSCALL_ADDR {5 bytes} |
| * call [edx] {2 bytes} |
| * ret arg_bytes {1 byte (0 args) or 3 bytes} |
| * |
| * For WOW64 (case 3922), there are two types: if setting ecx to 0, xor is used. |
| * mov eax, sysnum {5 bytes} |
| * mov ecx, wow_index {5 bytes} --OR-- xor ecx,ecx {2 bytes} |
| * lea edx, [esp+4] {4 bytes} |
| * call fs:0xc0 {7 bytes} |
| * On Win7 WOW64 after the call we have an add: |
| * add esp,0x4 {3 bytes} |
| * ret arg_bytes {1 byte (0 args) or 3 bytes} |
| * On Win8 WOW64 we have no ecx (and no post-syscall add): |
| * 777311bc b844000100 mov eax,10044h |
| * 777311c1 64ff15c0000000 call dword ptr fs:[0C0h] |
| * 777311c8 c3 ret |
| * |
| * For win8 sysenter we have a co-located "inlined" callee: |
| * 77d7422c b801000000 mov eax,1 |
| * 77d74231 e801000000 call ntdll!NtYieldExecution+0xb (77d74237) |
| * 77d74236 c3 ret |
| * 77d74237 8bd4 mov edx,esp |
| * 77d74239 0f34 sysenter |
| * 77d7423b c3 ret |
| * But we instead do the equivalent call to KiFastSystemCall. |
| * |
| * x64 syscall (PR 215398): |
| * mov r10, rcx {3 bytes} |
| * mov eax, sysnum {5 bytes} |
| * syscall {2 bytes} |
| * ret {1 byte} |
| */ |
| |
| /* build correct instr list */ |
| #define APP(list, inst) instrlist_append((list), (inst)) |
| #ifdef X64 |
| APP(ilist, INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_R10), |
| opnd_create_reg(REG_RCX))); |
| APP(ilist, INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_EAX), |
| OPND_CREATE_INT32(sysnum))); |
| APP(ilist, INSTR_CREATE_syscall(dcontext)); |
| APP(ilist, INSTR_CREATE_ret(dcontext)); |
| #else |
| APP(ilist, INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_EAX), |
| opnd_create_immed_int(sysnum, OPSZ_4))); |
| /* NOTE - the structure of the wrapper depends only on the OS version, not on the |
| * syscall method (for ex. using int on XPsp2 just changes the target on the |
| * vsyscall page, not the wrapper layout). */ |
| if (get_os_version() <= WINDOWS_VERSION_2000) { |
| APP(ilist, |
| INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XDX), |
| opnd_create_base_disp(REG_XSP, REG_NULL, 0, 4, OPSZ_0))); |
| APP(ilist, INSTR_CREATE_int(dcontext, opnd_create_immed_int(0x2e, OPSZ_1))); |
| } else if (is_wow64_process(NT_CURRENT_PROCESS)) { |
| ASSERT(get_syscall_method() == SYSCALL_METHOD_WOW64); |
| if (syscall_uses_wow64_index()) { |
| ASSERT(wow64_index != NULL); |
| ASSERT(wow64_index[sys_enum] != SYSCALL_NOT_PRESENT); |
| if (wow64_index[sys_enum] == 0) { |
| APP(ilist, INSTR_CREATE_xor(dcontext, opnd_create_reg(REG_XCX), |
| opnd_create_reg(REG_XCX))); |
| } else { |
| APP(ilist, INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_XCX), |
| OPND_CREATE_INT32(wow64_index[sys_enum]))); |
| } |
| APP(ilist, |
| INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XDX), |
| opnd_create_base_disp(REG_XSP, REG_NULL, 0, 4, OPSZ_0))); |
| } |
| APP(ilist, create_syscall_instr(dcontext)); |
| } else { /* XP or greater */ |
| if (get_os_version() >= WINDOWS_VERSION_8) { |
| /* Win8 does not use ind calls: it calls to a local copy of KiFastSystemCall. |
| * We do the next best thing. |
| */ |
| ASSERT(KiFastSystemCall != NULL); |
| APP(ilist, INSTR_CREATE_call(dcontext, opnd_create_pc(KiFastSystemCall))); |
| } else { |
| APP(ilist, |
| INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_XDX), |
| OPND_CREATE_INTPTR((ptr_int_t) |
| VSYSCALL_BOOTSTRAP_ADDR))); |
| |
| if (use_ki_syscall_routines()) { |
| /* call through vsyscall addr to Ki*SystemCall routine */ |
| APP(ilist, |
| INSTR_CREATE_call_ind(dcontext, opnd_create_base_disp |
| (REG_XDX, REG_NULL, 0, 0, OPSZ_4_short2))); |
| } else { |
| /* call to vsyscall addr */ |
| APP(ilist, INSTR_CREATE_call_ind(dcontext, opnd_create_reg(REG_XDX))); |
| } |
| } |
| } |
| if (is_wow64_process(NT_CURRENT_PROCESS) && get_os_version() == WINDOWS_VERSION_7) { |
| APP(ilist, |
| INSTR_CREATE_add(dcontext, opnd_create_reg(REG_XSP), OPND_CREATE_INT8(4))); |
| } |
| |
| if (arg_bytes == 0) { |
| APP(ilist, INSTR_CREATE_ret(dcontext)); |
| } else { |
| APP(ilist, |
| INSTR_CREATE_ret_imm(dcontext, opnd_create_immed_int(arg_bytes, OPSZ_1))); |
| } |
| #endif /* X64 */ |
| #undef APP |
| |
| /* we've seen 3 different ways of hooking syscall wrappers: |
| * 1) jmp overwriting first 5 bytes (mov eax, sysnum), most common. |
| * 2) jmp overwriting second 5 bytes (certain versions of Sygate) |
| * 3) overwriting first 8 bytes with push eax (x3) then jmp (Spyware Doctor 9288, A^2 |
| * anti-spyware 10414). */ |
| |
| /* NOTE - we could finish the walk whether hooked or not, but there's not much point |
| * and I don't fully trust our decode routines w/ junk input (e.g. if the hook doesn't |
| * end on an instr boundary). */ |
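| /* Walk the expected ilist in lockstep with a decode of the real wrapper; the |
| * first mismatch (other than the Win8 inlined-sysenter call vs our call to |
| * KiFastSystemCall) marks the wrapper as hooked. |
| */ |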
| for (instr_new = instrlist_first(ilist); instr_new != NULL; |
| instr_new = instr_get_next(instr_new)) { |
| instr_reset(dcontext, instr_old); |
| pc = decode(dcontext, pc, instr_old); |
| if (!instr_same(instr_new, instr_old) && |
| /* don't consider call to KiFastSystemCall vs inlined sysenter to be a hook */ |
| !(get_os_version() >= WINDOWS_VERSION_8 && |
| instr_get_opcode(instr_new) == instr_get_opcode(instr_old) && |
| instr_get_opcode(instr_new) == OP_call)) { |
| /* We haven't seen hookers where the opcode would match, so in that case |
| * it seems likely to be our fault (got an immed wrong or something). */ |
| ASSERT_CURIOSITY(instr_get_opcode(instr_new) != instr_get_opcode(instr_old)); |
| /* we haven't seen any hook start deeper than the 2nd instruction */ |
| ASSERT_CURIOSITY(instr_new == instrlist_first(ilist) || |
| instr_new == instr_get_next(instrlist_first(ilist))); |
| hooked = true; |
| break; |
| } |
| } |
| |
| LOG(GLOBAL, LOG_SYSCALLS, hooked ? 1U : 2U, |
| "Syscall wrapper @ "PFX" syscall_num=0x%03x%s hooked.\n", |
| nt_wrapper, sysnum, hooked ? "" : " not"); |
| |
| if (hooked) { |
| bool changed_prot; |
| int length = 0, encode_length; |
| byte *nxt_pc; |
| instr_t *in; |
| |
| SYSLOG_INTERNAL_WARNING_ONCE("Cleaning hooked Nt wrapper @"PFX" sysnum=0x%03x", |
| nt_wrapper, sysnum); |
| for (in = instrlist_first(ilist); in != NULL; in = instr_get_next(in)) |
| length += instr_length(dcontext, in); |
| DOLOG(1, LOG_SYSCALLS, { |
| LOG(GLOBAL, LOG_SYSCALLS, 1, "Replacing hooked wrapper :\n"); |
| pc = nt_wrapper; |
| /* Note - we may disassemble junk here (if hook doesn't end on instr |
| * boundary) but our decode routines should handle it; is debug anyways. */ |
| while (pc - nt_wrapper < length) |
| pc = disassemble_with_bytes(dcontext, pc, GLOBAL); |
| LOG(GLOBAL, LOG_SYSCALLS, 1, "With :\n"); |
| instrlist_disassemble(dcontext, nt_wrapper, ilist, GLOBAL); |
| }); |
| |
| make_hookable(nt_wrapper, length, &changed_prot); |
| nxt_pc = instrlist_encode(dcontext, ilist, nt_wrapper, |
| false /* no jmp targets */); |
| ASSERT(nxt_pc != NULL); |
| encode_length = (int) (nxt_pc - nt_wrapper); |
| ASSERT(encode_length == length && "clean syscall encoded length mismatch"); |
| make_unhookable(nt_wrapper, length, changed_prot); |
| |
| DOLOG(1, LOG_SYSCALLS, { |
| LOG(GLOBAL, LOG_SYSCALLS, 1, "Cleaned wrapper is now :\n"); |
| pc = nt_wrapper; |
| while (pc - nt_wrapper < length) |
| pc = disassemble_with_bytes(dcontext, pc, GLOBAL); |
| }); |
| } |
| |
| exit_clean_syscall_wrapper: |
| instr_destroy(dcontext, instr_old); |
| instrlist_clear_and_destroy(dcontext, ilist); |
| return hooked; |
| } |
| |
| /* Inserts a trampoline in a system call wrapper. |
| * All uses should end up using dstack -- else watch out for initstack |
| * infinite loop (see comment above). |
| * Returns in skip_syscall_pc the native pc for skipping the system call altogether. |
| * |
| * Since the only safe point is the first instr, and not right at the syscall |
| * instr itself (no 5-byte spot there), we have to copy the whole series of app |
| * instrs up until the syscall instr into our buffer to be executed prior to the |
| * callee. This means any intercepted syscall from the cache will have that |
| * sequence run NATIVELY! A solution is to set a flag to go back to native |
| * after the next syscall, and take over right away, but that is a little more |
| * worrisome than executing only the syscall under DR, in terms of the potential |
| * to miss the re-native trigger. |
| * |
| * For x64, we still use a 5-byte jump, assuming our main heap is within 2GB of |
| * ntdll.dll (xref PR 215395); if not we'll need an auxiliary landing pad |
| * trampoline within 2GB (xref PR 250294 where we need to support such |
| * trampolines for general hooks). Also xref PR 245169 on x64 hooking |
| * possibilities, none of which is ideal. |
| * |
| * FIXME: other interception ideas: could do at instr after mov-immed, |
| * and arrange own int 2e for win2k, and emulate rest of sequence when |
| * handling syscall from handler -- this would eliminate some issues |
| * with the pre-syscall sequence copy, but not clear if better overall. |
| * Would be nice to have a single shared syscall handler, but since |
| * wrappers are stdcall that would be difficult. |
| * |
| * We allow the callee to execute the syscall itself, and by returning |
| * AFTER_INTERCEPT_LET_GO_ALT_DYN, it signals to skip the actual syscall, |
| * so we have control returned to the instr after the syscall instr. |
| * For AFTER_INTERCEPT_LET_GO or AFTER_INTERCEPT_TAKE_OVER, the syscall |
| * instr itself is the next instr to be executed. |
| * |
| * N.B.: this routine makes assumptions about the exact sequence of instrs in |
| * syscall wrappers, in particular that the indirect call to the vsyscall page |
| * can be turned into a direct call, which is only safe for XP SP2 if the |
| * vsyscall page is not writable, and cannot be made writable, which is what we |
| * have observed to be true. |
| */ |
| |
| /* Helper function that returns the after-hook pc */ |
| static byte * |
| syscall_wrapper_ilist(dcontext_t *dcontext, |
| instrlist_t *ilist, /* IN/OUT */ |
| byte **ptgt_pc /* IN/OUT */, |
| void *callee_arg, |
| byte *fpo_stack_adjustment, /* OUT OPTIONAL */ |
| byte **ret_pc /* OUT */, |
| const char *name) |
| { |
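| /* Outputs: *ptgt_pc may be advanced if we end up hooking deeper than the 1st |
| * instr; *ret_pc receives the native pc for skipping the syscall altogether; |
| * the return value is the after-hook pc where the landing pad resumes native |
| * execution, or NULL if we decline to hook. |
| */ |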
| byte *pc, *after_hook_target = NULL; |
| byte *after_mov_immed; |
| instr_t *instr, *hook_return_instr = NULL; |
| int opcode = OP_UNDECODED; |
| int sys_enum = (int)(ptr_uint_t)callee_arg; |
| int native_sys_num = syscalls[sys_enum]; |
| |
| pc = *ptgt_pc; |
| /* we need 5 bytes for a jump, and we assume that the first instr |
| * (2nd instr for x64, where we skip the 1st) is a 5-byte mov immed! |
| */ |
| instr = instr_create(dcontext); |
| pc = decode(dcontext, pc, instr); |
| after_mov_immed = pc; |
| /* FIXME: handle other hookers gracefully by chaining! |
| * Note that moving trampoline point 5 bytes in could help here (see above). |
| */ |
| #ifndef X64 |
| ASSERT(instr_length(dcontext, instr) >= 5); |
| #endif |
| if (fpo_stack_adjustment != NULL) |
| *fpo_stack_adjustment = 0; /* for GBOP case 7127 */ |
| |
| if (instr_is_cti(instr)) { |
| /* we only have to rerelativize rel32, yet indirect |
| * branches can also be used by hookers, in which case we |
| * don't need to do anything special when copying as bytes |
| * FIXME: should we still die? |
| */ |
| |
| /* see case 2525 for background discussion */ |
| if (DYNAMO_OPTION(native_exec_hook_conflict) == HOOKED_TRAMPOLINE_DIE) { |
| /* FIXME: we could still print the message but we don't have to kill the app here */ |
| FATAL_USAGE_ERROR(TAMPERED_NTDLL, 2, get_application_name(), |
| get_application_pid()); |
| } else if (DYNAMO_OPTION(native_exec_hook_conflict) == HOOKED_TRAMPOLINE_CHAIN) { |
| /* we assume 5-byte hookers as well - so only need to relativize in our own copy */ |
| /* and we need to introduce a PUSH in case of a CALL here */ |
| |
| ASSERT(instr_get_opcode(instr) != OP_call_ind); |
| if (instr_is_mbr(instr)) { |
| /* one can imagine mbr being used on x64 */ |
| FATAL_USAGE_ERROR(TAMPERED_NTDLL, 2, get_application_name(), |
| get_application_pid()); |
| } |
| if (instr_get_opcode(instr) == OP_call) { |
| LOG(GLOBAL, LOG_ASYNCH, 2, |
| "intercept_syscall_wrapper: mangling hooked call at "PFX"\n", pc); |
| /* replace the call w/ a push/jmp hoping this will |
| * eventually return to us unless the hooker decides |
| * to squash the system call or execute without going |
| * back here. |
| * FIXME: keep in mind the code on the instrlist is executed natively |
| */ |
| insert_push_immed_ptrsz(dcontext, (ptr_int_t)pc, ilist, NULL, |
| NULL, NULL); |
| #ifdef X64 |
| /* check reachability from new location */ |
| /* allow interception code to be up to a page: don't bother |
| * to calculate exactly where our jmp will be encoded */ |
| if (!REL32_REACHABLE(interception_cur_pc, |
| opnd_get_pc(instr_get_target(instr))) || |
| !REL32_REACHABLE(interception_cur_pc + PAGE_SIZE, |
| opnd_get_pc(instr_get_target(instr)))) { |
| FATAL_USAGE_ERROR(TAMPERED_NTDLL, 2, get_application_name(), |
| get_application_pid()); |
| } |
| #endif |
| instrlist_append(ilist, |
| INSTR_CREATE_jmp(dcontext, |
| opnd_create_pc(opnd_get_pc(instr_get_target(instr))))); |
| /* skip original instruction */ |
| instr_destroy(dcontext, instr); |
| /* interp still needs to be updated */ |
| ASSERT_NOT_IMPLEMENTED(false); |
| } else if (instr_get_opcode(instr) == OP_jmp) { |
| /* FIXME - no good way to regain control after the hook */ |
| ASSERT_NOT_IMPLEMENTED(false); |
| LOG(GLOBAL, LOG_ASYNCH, 2, "intercept_syscall_wrapper: hooked with jmp "PFX"\n", pc); |
| /* just append instruction as is */ |
| instrlist_append(ilist, instr); |
| } else { |
| ASSERT_NOT_IMPLEMENTED(false && "unchainable CTI"); |
| /* FIXME PR 215397: need to re-relativize pc-relative memory reference */ |
| IF_X64(ASSERT_NOT_IMPLEMENTED(!instr_has_rel_addr_reference(instr))); |
| /* just append the instruction as is; the emit will re-relativize it if necessary */ |
| instrlist_append(ilist, instr); |
| /* FIXME: if instr's length doesn't match normal 1st instr we'll |
| * get off down below: really shouldn't continue here */ |
| } |
| } else if (DYNAMO_OPTION(native_exec_hook_conflict) == HOOKED_TRAMPOLINE_SQUASH) { |
| SYSLOG_INTERNAL_WARNING("intercept_syscall_wrapper: " |
| "squashing hook in %s @"PFX, name, pc); |
| LOG(GLOBAL, LOG_ASYNCH, 2, |
| "intercept_syscall_wrapper: squashing hooked syscall %s %02x at "PFX"\n", |
| name, native_sys_num, pc); |
| #ifdef X64 |
| /* in this case we put our hook at the 1st instr */ |
| instrlist_append(ilist, |
| INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_R10), |
| opnd_create_reg(REG_RCX))); |
| #endif |
| /* we normally ASSERT that 1st instr is always mov imm -> eax */ |
| instrlist_append(ilist, |
| INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_EAX), |
| OPND_CREATE_INT32(native_sys_num))); |
| /* FIXME: even if we detach we don't restore the original |
| * values, since what we have here should be good enough |
| */ |
| /* skip original instruction */ |
| instr_destroy(dcontext, instr); |
| } else if (DYNAMO_OPTION(native_exec_hook_conflict) == HOOKED_TRAMPOLINE_HOOK_DEEPER) { |
| /* move our hook one instruction deeper assuming hooker will |
| * return to right after the hook, verify that's an |
| * instruction boundary */ |
| #ifdef X64 |
| /* not much room for two hooks before the syscall; we don't support |
| * for now */ |
| ASSERT_NOT_REACHED(); |
| FATAL_USAGE_ERROR(TAMPERED_NTDLL, 2, get_application_name(), |
| get_application_pid()); |
| #else |
| ASSERT(instr_length(dcontext, instr) == 5 /* length of normal mov_imm */); |
| *ptgt_pc = pc; |
| /* skip original instruction */ |
| instr_destroy(dcontext, instr); |
| #endif |
| } else if (DYNAMO_OPTION(native_exec_hook_conflict) == |
| HOOKED_TRAMPOLINE_NO_HOOK) { |
| SYSLOG_INTERNAL_WARNING("intercept_syscall_wrapper: " |
| "not hooking %s due to conflict @"PFX, name, pc); |
| LOG(GLOBAL, LOG_ASYNCH, 2, |
| "intercept_syscall_wrapper: not hooking syscall %s %02x at "PFX"\n", |
| name, native_sys_num, pc); |
| instr_destroy(dcontext, instr); |
| return NULL; |
| } else { |
| ASSERT_NOT_REACHED(); |
| FATAL_USAGE_ERROR(TAMPERED_NTDLL, 2, get_application_name(), |
| get_application_pid()); |
| } |
| } else { |
| #ifdef X64 |
| /* first instr is mov rcx -> r10, which we skip to reach the 5-byte mov immed */ |
| ASSERT(instr_get_opcode(instr) == OP_mov_ld && |
| opnd_is_reg(instr_get_src(instr, 0)) && |
| opnd_get_reg(instr_get_src(instr, 0)) == REG_RCX && |
| opnd_is_reg(instr_get_dst(instr, 0)) && |
| opnd_get_reg(instr_get_dst(instr, 0)) == REG_R10); |
| /* we hook after the 1st instr. will this confuse other hookers who |
| * will think there currently is no hook b/c not on 1st instr? */ |
| *ptgt_pc = pc; |
| instr_destroy(dcontext, instr); |
| /* now decode the 2nd instr which should be a mov immed */ |
| DOLOG(3, LOG_ASYNCH, { disassemble_with_bytes(dcontext, pc, main_logfile); }); |
| instr = instr_create(dcontext); |
| pc = decode(dcontext, pc, instr); |
| ASSERT(instr_length(dcontext, instr) == 5 /* length of normal mov_imm */); |
| opcode = instr_get_opcode(instr); |
| /* now fall through */ |
| #endif |
| /* normally a mov eax, native_sys_num */ |
| ASSERT(instr_get_opcode(instr) == OP_mov_imm); |
| ASSERT(opnd_get_immed_int(instr_get_src(instr, 0)) == native_sys_num); |
| LOG(GLOBAL, LOG_ASYNCH, 3, "intercept_syscall_wrapper: hooked syscall %02x at "PFX"\n", |
| native_sys_num, pc); |
| /* append instruction (non-CTI) */ |
| instrlist_append(ilist, instr); |
| } |
| |
| #ifdef X64 |
| /* 3rd instr: syscall */ |
| instr = instr_create(dcontext); |
| after_hook_target = pc; |
| pc = decode(dcontext, pc, instr); |
| *ret_pc = pc; |
| ASSERT(instr_get_opcode(instr) == OP_syscall); |
| instr_destroy(dcontext, instr); |
| #else |
| if (get_syscall_method() == SYSCALL_METHOD_WOW64 && |
| get_os_version() >= WINDOWS_VERSION_8) { |
| ASSERT(!syscall_uses_wow64_index()); |
| /* second instr is a call*, what we consider the system call instr */ |
| after_hook_target = pc; |
| instr = instr_create(dcontext); |
| *ret_pc = decode(dcontext, pc, instr); /* skip call* to skip syscall */ |
| ASSERT(instr_get_opcode(instr) == OP_call_ind); |
| instr_destroy(dcontext, instr); |
| /* XXX: how handle chrome hooks on win8? (xref i#464) */ |
| } else if (get_syscall_method() == SYSCALL_METHOD_SYSENTER && |
| get_os_version() >= WINDOWS_VERSION_8) { |
| /* Second instr is a call to an inlined routine that calls sysenter. |
| * We treat this in a similar way to call* to sysenter which is handled |
| * down below. |
| * XXX: could share a little bit of code but not much. |
| */ |
| after_hook_target = pc; |
| instr = instr_create(dcontext); |
| *ret_pc = decode(dcontext, pc, instr); /* skip call to skip syscall */ |
| ASSERT(instr_get_opcode(instr) == OP_call); |
| |
| /* replace the call w/ a push */ |
| instrlist_append(ilist, INSTR_CREATE_push_imm |
| (dcontext, OPND_CREATE_INTPTR((ptr_int_t)*ret_pc))); |
| |
| /* the callee, inlined later in wrapper, or KiFastSystemCall */ |
| pc = (byte *) opnd_get_pc(instr_get_target(instr)); |
| |
| /* fourth instr: mov %xsp -> %xdx */ |
| instr_reset(dcontext, instr); /* re-use call container */ |
| pc = decode(dcontext, pc, instr); |
| instrlist_append(ilist, instr); |
| ASSERT(instr_get_opcode(instr) == OP_mov_ld); |
| |
| /* fifth instr: sysenter */ |
| instr = instr_create(dcontext); |
| after_hook_target = pc; |
| pc = decode(dcontext, pc, instr); |
| ASSERT(instr_get_opcode(instr) == OP_sysenter); |
| instr_destroy(dcontext, instr); |
| |
| /* ignore ret after sysenter, we'll return to ret after call */ |
| |
| } else { |
| /* second instr is either a lea, a mov immed, or an xor */ |
| DOLOG(3, LOG_ASYNCH, { disassemble_with_bytes(dcontext, pc, main_logfile); }); |
| instr = instr_create(dcontext); |
| pc = decode(dcontext, pc, instr); |
| instrlist_append(ilist, instr); |
| opcode = instr_get_opcode(instr); |
| } |
| if (after_hook_target != NULL) { |
| /* all set */ |
| } else if (get_syscall_method() == SYSCALL_METHOD_WOW64) { |
| ASSERT(opcode == OP_xor || opcode == OP_mov_imm); |
| /* third instr is a lea */ |
| instr = instr_create(dcontext); |
| pc = decode(dcontext, pc, instr); |
| |
| if (instr_get_opcode(instr) == OP_jmp_ind) { |
| /* Handle chrome hooks (i#464) via targeted handling since these |
| * don't look like any other hooks we've seen. We can generalize if |
| * we later find similar-looking hooks elsewhere. |
| * They look like this: |
| * ntdll!NtMapViewOfSection: |
| * 77aafbe0 b825000000 mov eax,0x25 |
| * 77aafbe5 ba28030a00 mov edx,0xa0328 |
| * 77aafbea ffe2 jmp edx |
| * 77aafbec c215c0 ret 0xc015 |
| * 77aafbef 90 nop |
| * 77aafbf0 0000 add [eax],al |
| * 77aafbf2 83c404 add esp,0x4 |
| * 77aafbf5 c22800 ret 0x28 |
| * We put in the native instrs in our hook so our stuff |
| * operates correctly, and assume the native state change |
| * won't affect the chrome hook code. We resume |
| * right after the 1st mov-imm-eax instr. These are the native |
| * instrs for all chrome hooks in ntdll (Nt{,Un}MapViewOfSection), |
| * which are put in place from the parent, so they're there when we |
| * initialize and aren't affected by -handle_ntdll_modify: |
| * 77aafbe5 33c9 xor ecx,ecx |
| * 77aafbe7 8d542404 lea edx,[esp+0x4] |
| */ |
| instr_t *tmp = instrlist_last(ilist); |
| instrlist_remove(ilist, tmp); |
| instr_destroy(dcontext, tmp); |
| instr_destroy(dcontext, instr); |
| ASSERT(syscall_uses_wow64_index()); /* else handled above */ |
| ASSERT(wow64_index != NULL); |
| if (wow64_index[sys_enum] == 0) { |
| instrlist_append |
| (ilist, INSTR_CREATE_xor |
| (dcontext, opnd_create_reg(REG_XCX), opnd_create_reg(REG_XCX))); |
| } else { |
| instrlist_append |
| (ilist, INSTR_CREATE_mov_imm |
| (dcontext, opnd_create_reg(REG_XCX), |
| OPND_CREATE_INT32(wow64_index[sys_enum]))); |
| } |
| instrlist_append |
| (ilist, INSTR_CREATE_lea |
| (dcontext, opnd_create_reg(REG_XDX), |
| opnd_create_base_disp(REG_XSP, REG_NULL, 0, 0x4, OPSZ_lea))); |
| after_hook_target = after_mov_immed; |
| /* skip chrome hook to skip syscall: target "add esp,0x4" */ |
| # define CHROME_HOOK_DISTANCE_JMP_TO_SKIP 6 |
| *ret_pc = pc + CHROME_HOOK_DISTANCE_JMP_TO_SKIP; |
| DOCHECK(1, { |
| instr = instr_create(dcontext); |
| decode(dcontext, *ret_pc, instr); |
| ASSERT(instr_get_opcode(instr) == OP_add); |
| instr_destroy(dcontext, instr); |
| }); |
| } else { |
| ASSERT(instr_get_opcode(instr) == OP_lea); |
| instrlist_append(ilist, instr); |
| |
| /* fourth instr is a call*, what we consider the system call instr */ |
| after_hook_target = pc; |
| instr = instr_create(dcontext); |
| *ret_pc = decode(dcontext, pc, instr); /* skip call* to skip syscall */ |
| ASSERT(instr_get_opcode(instr) == OP_call_ind); |
| instr_destroy(dcontext, instr); |
| } |
| } else if (opcode == OP_mov_imm) { |
| ptr_int_t immed = opnd_get_immed_int(instr_get_src(instr, 0)); |
| ASSERT(PAGE_START(immed) == (ptr_uint_t) VSYSCALL_PAGE_START_BOOTSTRAP_VALUE); |
| ASSERT(get_syscall_method() == SYSCALL_METHOD_SYSENTER); |
| ASSERT(get_os_version() >= WINDOWS_VERSION_XP); |
| |
| /* third instr is an indirect call */ |
| instr = instr_create(dcontext); |
| pc = decode(dcontext, pc, instr); |
| *ret_pc = pc; |
| ASSERT(instr_get_opcode(instr) == OP_call_ind); |
| if (fpo_stack_adjustment != NULL) { |
| /* for GBOP case 7127 */ |
| *fpo_stack_adjustment = 4; |
| } |
| /* replace the call w/ a push */ |
| instrlist_append(ilist, INSTR_CREATE_push_imm |
| (dcontext, OPND_CREATE_INTPTR((ptr_int_t)pc))); |
| |
| /* the callee, either on vsyscall page or at KiFastSystemCall */ |
| if (opnd_is_reg(instr_get_src(instr, 0))) |
| pc = (byte *) immed; |
| else /* KiFastSystemCall */ |
| pc = *((byte **)immed); |
| |
| /* fourth instr: mov %xsp -> %xdx */ |
| instr_reset(dcontext, instr); /* re-use ind call container */ |
| pc = decode(dcontext, pc, instr); |
| instrlist_append(ilist, instr); |
| ASSERT(instr_get_opcode(instr) == OP_mov_ld); |
| |
| /* fifth instr: sysenter */ |
| instr = instr_create(dcontext); |
| after_hook_target = pc; |
| pc = decode(dcontext, pc, instr); |
| ASSERT(instr_get_opcode(instr) == OP_sysenter); |
| instr_destroy(dcontext, instr); |
| |
| /* ignore ret after sysenter, we'll return to ret after call */ |
| } else { |
| ASSERT(opcode == OP_lea); |
| /* third instr: int 2e */ |
| instr = instr_create(dcontext); |
| *ret_pc = decode(dcontext, pc, instr); |
| ASSERT(instr_get_opcode(instr) == OP_int); |
| /* if we hooked deeper, will need to hook over the int too */ |
| if (pc - *ptgt_pc < 5 /* length of our hook */) { |
| /* Need to add an int 2e to the return path since hook clobbered |
| * the original one. We use create_syscall_instr(dcontext) for |
| * the sygate int fix. FIXME - the pc will now show up as |
| * after_do/share_syscall() but should be ok since anyone |
| * checking for those on this thread should have already checked |
| * for it being native. */ |
| hook_return_instr = create_syscall_instr(dcontext); |
| after_hook_target = *ret_pc; |
| ASSERT(DYNAMO_OPTION(native_exec_hook_conflict) == |
| HOOKED_TRAMPOLINE_HOOK_DEEPER); |
| } else { |
| /* point after_hook_target to int 2e */ |
| after_hook_target = pc; |
| } |
| instr_destroy(dcontext, instr); |
| } |
| #endif |
| return after_hook_target; |
| } |
| |
| byte * |
| intercept_syscall_wrapper(byte **ptgt_pc /* IN/OUT */, |
| intercept_function_t prof_func, |
| void *callee_arg, after_intercept_action_t action_after, |
| app_pc *skip_syscall_pc /* OUT */, |
| byte **orig_bytes_pc /* OUT */, |
| byte *fpo_stack_adjustment /* OUT OPTIONAL */, |
| const char *name) |
| { |
| byte *pc, *emit_pc, *ret_pc = NULL, *after_hook_target = NULL, *tgt_pc; |
| byte *lpad_start, *lpad_pc, *lpad_resume_pc, *xl8_start_pc; |
| instr_t *instr, *hook_return_instr = NULL; |
| instrlist_t ilist; |
| bool changed_prot; |
| dcontext_t *dcontext = get_thread_private_dcontext(); |
| bool ok; |
| if (dcontext == NULL) |
| dcontext = GLOBAL_DCONTEXT; |
| |
| instrlist_init(&ilist); |
| |
| ASSERT(ptgt_pc != NULL && *ptgt_pc != NULL); |
| |
| after_hook_target = syscall_wrapper_ilist(dcontext, &ilist, ptgt_pc, callee_arg, |
| fpo_stack_adjustment, &ret_pc, name); |
| if (after_hook_target == NULL) |
| return NULL; /* aborted */ |
| |
| tgt_pc = *ptgt_pc; |
| pc = tgt_pc; |
| LOG(GLOBAL, LOG_ASYNCH, 3, "%s: before intercepting:\n", __FUNCTION__); |
| DOLOG(3, LOG_ASYNCH, { disassemble_with_bytes(dcontext, pc, main_logfile); }); |
| |
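| /* Rough layout of the syscall trampoline we emit in the interception buffer: |
| * [5-byte copy of original wrapper bytes (not executed)] <- *orig_bytes_pc |
| * [copy of pre-syscall wrapper instrs] <- landing pad jmps here (emit_pc) |
| * [intercept code emitted by emit_intercept_code()] |
| * [syscall instr, only if we hooked over the original one] |
| * [jmp back to the landing pad's resume point] |
| */ |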
| pc = interception_cur_pc; /* current spot in interception buffer */ |
| |
| /* copy original 5 bytes to ease unhooking, we won't execute this */ |
| *orig_bytes_pc = pc; |
| memcpy(pc, tgt_pc, 5); |
| pc += 5; |
| |
| /* i#901: We need a landing pad b/c ntdll may not be reachable from DR. |
| * However, we do not support rip-rel instrs in the syscall wrapper, as by |
| * keeping the displaced app code in the intercept buffer and not in the |
| * landing pad we can use the standard landing pad layout, the existing |
| * emit_landing_pad_code(), the existing is_syscall_trampoline(), and other |
| * routines, and also keeps the landing pads themselves a constant size and |
| * layout (though the ones here do not have all their space used b/c there's |
| * no displaced app code). |
| */ |
| lpad_start = alloc_landing_pad(tgt_pc); |
| lpad_pc = lpad_start; |
| lpad_pc = emit_landing_pad_code(lpad_pc, pc, after_hook_target, |
| 0/*no displaced code in lpad*/, |
| &lpad_resume_pc, &changed_prot); |
| /* i#1027: map jmp back in landing pad to original app pc. We do this to |
| * have the translation just in case, even though we hide this jmp from the |
| * client. Xref the PR 219351 comment in is_intercepted_app_pc(). |
| */ |
| map_intercept_pc_to_app_pc(lpad_resume_pc, after_hook_target, JMP_LONG_LENGTH, 0); |
| finalize_landing_pad_code(lpad_start, changed_prot); |
| |
| emit_pc = pc; |
| /* we assume that interception buffer is still writable */ |
| |
| /* we need to enter at copy of pre-syscall sequence, since we need |
| * callee to be at app state exactly prior to syscall instr itself. |
| * this means this sequence is executed natively even for syscalls |
| * in the cache (since interception code is run natively) -- only |
| * worry would be stack faults, whose context we might xlate incorrectly |
| * |
| * N.B.: bb_process_ubr() assumes that the target of the trampoline |
| * is the original mov immed! |
| */ |
| |
| /* insert our copy of app instrs leading up to syscall |
| * first instr doubles as the clobbered original code for un-intercepting. |
| */ |
| for (instr = instrlist_first(&ilist); instr != NULL; instr = instr_get_next(instr)) { |
| pc = instr_encode(dcontext, instr, pc); |
| ASSERT(pc != NULL); |
| } |
| instrlist_clear(dcontext, &ilist); |
| |
| pc = emit_intercept_code(dcontext, pc, prof_func, callee_arg, |
| false /*do not assume xsp*/, |
| false /*not known to not be on dstack: ok to clobber flags*/, |
| action_after, |
| ret_pc /* alternate target to skip syscall */, NULL); |
| |
| /* Map interception buffer PCs to original app PCs */ |
| if (is_in_interception_buffer(pc)) |
| map_intercept_pc_to_app_pc(pc, tgt_pc, 10 /* 5 bytes + jmp back */, 5); |
| |
| /* The normal target, for really doing the system call native, used |
| * for letting go normally and for take over. |
| * We already did pre-syscall sequence, so we go straight to syscall itself. |
| */ |
| /* have to include syscall instr here if we ended up hooking over it */ |
| xl8_start_pc = pc; |
| if (hook_return_instr != NULL) { |
| pc = instr_encode(dcontext, hook_return_instr, pc); |
| ASSERT(pc != NULL); |
| instr_destroy(dcontext, hook_return_instr); |
| } |
| pc = emit_resume_jmp(pc, lpad_resume_pc, tgt_pc, xl8_start_pc); |
| |
| /* update interception buffer pc */ |
| interception_cur_pc = pc; |
| |
| /* Replace original code with jmp to our version's entrance */ |
| /* copy-on-write will give us a copy of this page */ |
| ok = make_hookable(tgt_pc, 5, &changed_prot); |
| if (ok) { |
| ptr_int_t offset = (lpad_pc - (tgt_pc + 5)); |
| #ifdef X64 |
| if (!REL32_REACHABLE_OFFS(offset)) { |
| ASSERT_NOT_IMPLEMENTED(false && "PR 245169: hook target too far: NYI"); |
| /* FIXME PR 245169: we need use landing_pad_areas to alloc landing |
| * pads to trampolines, as done for PR 250294. |
| */ |
| } |
| #endif |
| pc = tgt_pc; |
| *pc = JMP_REL32_OPCODE; |
| pc++; |
| IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_int(offset))); |
| *((int *)pc) = (int) offset; |
| } |
| /* make our page unwritable now */ |
| make_unhookable(tgt_pc, 5, changed_prot); |
| |
| if (skip_syscall_pc != NULL) |
| *skip_syscall_pc = ret_pc; |
| |
| return emit_pc; |
| } |
| |
| /* two convenience routines for intercepting using the code[] buffer |
| * after the initialization routine has completed |
| * |
| * WARNING: only call this when there is only one thread going! |
| * This is not thread-safe! |
| */ |
| byte * |
| insert_trampoline(byte *tgt_pc, intercept_function_t prof_func, |
| void *callee_arg, bool assume_xsp, after_intercept_action_t action_after, |
| bool cti_safe_to_ignore) |
| { |
| byte *pc = interception_cur_pc; |
| /* Make the interception code writable. NOTE: the interception code may |
| * be on the vmareas executable list; we make it temporarily writable |
| * here without removing or flushing the region. This is ok since |
| * we should be single-threaded when this function is called and we never |
| * overwrite existing interception code. */ |
| DEBUG_DECLARE(bool ok =) |
| make_writable(interception_code, INTERCEPTION_CODE_SIZE); |
| ASSERT(ok); |
| |
| /* FIXME: worry about inserting trampoline across bb boundaries? */ |
| interception_cur_pc = intercept_call(interception_cur_pc, tgt_pc, prof_func, callee_arg, |
| assume_xsp, action_after, |
| false, /* need the trampoline at all costs */ |
| cti_safe_to_ignore, NULL, NULL); |
| /* FIXME: we assume early intercept_call failures are ok to |
| * ignore. Note we may want to crash instead if trying to sandbox |
| * malicious programs that may be able to prevent us from |
| * committing memory. |
| */ |
| |
| ASSERT(interception_cur_pc - interception_code < INTERCEPTION_CODE_SIZE); |
| |
| /* return interception code to read only state */ |
| make_unwritable(interception_code, INTERCEPTION_CODE_SIZE); |
| |
| return pc; |
| } |
| |
| void |
| remove_trampoline(byte *our_pc, byte *tgt_pc) |
| { |
| un_intercept_call(our_pc, tgt_pc); |
| } |
| |
| bool |
| is_in_interception_buffer(byte *pc) |
| { |
| return (pc >= interception_code && |
| pc < interception_code + INTERCEPTION_CODE_SIZE); |
| } |
| |
| bool |
| is_part_of_interception(byte *pc) |
| { |
| return (is_in_interception_buffer(pc) || |
| vmvector_overlap(landing_pad_areas, pc, pc + 1)); |
| } |
| |
| bool |
| is_on_interception_initial_route(byte *pc) |
| { |
| if (vmvector_overlap(landing_pad_areas, pc, pc + 1)) { |
| /* Look for the forward jump. For x64, any ind jmp will do, as reverse |
| * jmp is direct. |
| */ |
| if (IF_X64_ELSE(*pc == JMP_ABS_IND64_OPCODE && |
| *(pc + 1) == JMP_ABS_MEM_IND64_MODRM, |
| *pc == JMP_REL32_OPCODE && |
| is_in_interception_buffer(PC_RELATIVE_TARGET(pc + 1)))) { |
| |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| bool |
| is_syscall_trampoline(byte *pc, byte **tgt) |
| { |
| if (syscall_trampolines_start == NULL) |
| return false; |
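| /* pc may point into a landing pad rather than into the trampoline region |
| * itself: in that case follow the jmp into the interception buffer (or |
| * recognize the jmp back to the syscall instr, i#1027) before the bounds |
| * check below. |
| */ |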
| if (vmvector_overlap(landing_pad_areas, pc, pc + 1)) { |
| /* Also count the jmp from landing pad back to syscall instr, which is |
| * immediately after the jmp from landing pad to interception buffer (i#1027). |
| */ |
| app_pc syscall; |
| if (is_jmp_rel32(pc, pc, &syscall) && |
| is_jmp_rel32(pc - JMP_LONG_LENGTH, NULL, NULL)) { |
| dcontext_t *dcontext = get_thread_private_dcontext(); |
| instr_t instr; |
| if (dcontext == NULL) |
| dcontext = GLOBAL_DCONTEXT; |
| instr_init(dcontext, &instr); |
| decode(dcontext, syscall, &instr); |
| if (instr_is_syscall(&instr)) { |
| /* proceed using the 1st jmp */ |
| pc -= JMP_LONG_LENGTH; |
| } |
| instr_free(dcontext, &instr); |
| } |
| #ifdef X64 |
| /* target is 8 bytes back */ |
| pc = *(app_pc *)(pc - sizeof(app_pc)); |
| #else |
| if (!is_jmp_rel32(pc, pc, &pc)) |
| return false; |
| #endif |
| } |
| if (pc >= syscall_trampolines_start && pc < syscall_trampolines_end) { |
| if (tgt != NULL) |
| *tgt = pc; |
| return true; |
| } |
| return false; |
| } |
| |
| /**************************************************************************** |
| */ |
| /* TRACK_NTDLL: try to find where kernel re-emerges into user mode when it |
| * dives into kernel mode |
| */ |
| #if TRACK_NTDLL |
| static byte * |
| make_writable_incr(byte *pc) |
| { |
| PBYTE pb = (PBYTE) pc; |
| MEMORY_BASIC_INFORMATION mbi; |
| DWORD old_prot; |
| int res; |
| |
| res = query_virtual_memory(pb, &mbi, sizeof(mbi)); |
| ASSERT(res == sizeof(mbi)); |
| |
| res = protect_virtual_memory(mbi.BaseAddress, mbi.RegionSize, |
| PAGE_EXECUTE_WRITECOPY, &old_prot); |
| ASSERT(res); |
| return (byte *)mbi.BaseAddress + mbi.RegionSize; |
| } |
| |
| static byte * |
| make_inaccessible(byte *pc) |
| { |
| PBYTE pb = (PBYTE) pc; |
| MEMORY_BASIC_INFORMATION mbi; |
| DWORD old_prot; |
| int res; |
| |
| res = query_virtual_memory(pb, &mbi, sizeof(mbi)); |
| ASSERT(res == sizeof(mbi)); |
| |
| res = protect_virtual_memory(mbi.BaseAddress, mbi.RegionSize, |
| PAGE_NOACCESS, &old_prot); |
| ASSERT(res); |
| return (byte *)mbi.BaseAddress + mbi.RegionSize; |
| } |
| |
| void |
| wipe_out_ntdll() |
| { |
| byte * start = (byte *) 0x77F81000; |
| byte * stop = (byte *) 0x77FCD95B; |
| byte *pc; |
| |
| /* first suspend all other threads */ |
| thread_record_t **threads; |
| int i, num_threads; |
| mutex_lock(&thread_initexit_lock); |
| get_list_of_threads(&threads, &num_threads); |
| for (i=0; i<num_threads; i++) { |
| if (threads[i]->id != get_thread_id()) { |
| LOG(GLOBAL, LOG_ASYNCH, 1, "Suspending thread "TIDFMT" == "PFX"\n", |
| threads[i]->id, threads[i]->handle); |
| SuspendThread(threads[i]->handle); |
| } |
| } |
| mutex_unlock(&thread_initexit_lock); |
| global_heap_free(threads, num_threads*sizeof(thread_record_t*) |
| HEAPACCT(ACCT_THREAD_MGT)); |
| |
| LOG(GLOBAL, LOG_ASYNCH, 1, "INVALIDATING ENTIRE NTDLL.DLL!!!\n"); |
| pc = start; |
| while (pc < stop) { |
| LOG(GLOBAL, LOG_ASYNCH, 1, "\t"PFX"\n", pc); |
| #if 0 |
| pc = make_inaccessible(pc); |
| #else |
| pc = make_writable_incr(pc); |
| #endif |
| } |
| #if 1 |
| for (pc=start; pc<stop; pc++) { |
| *pc = 0xcc; |
| } |
| #endif |
| } |
| #endif /* TRACK_NTDLL */ |
| /* |
| ****************************************************************************/ |
| |
| /* If we receive an asynch event while we've lost control but before we |
| * reach the image entry point or our other retakeover points we should |
| * retakeover, to minimize the amount of code run natively -- these should |
| * be rare during init and perf hit of repeated flushing and re-walking |
| * memory list shouldn't be an issue. |
| * Separated from asynch_take_over to not force its callers to do this. |
| */ |
| static inline void |
| asynch_retakeover_if_native() |
| { |
| thread_record_t *tr = thread_lookup(get_thread_id()); |
| ASSERT(tr != NULL); |
| if (IS_UNDER_DYN_HACK(tr->under_dynamo_control)) { |
| ASSERT(!reached_image_entry_yet()); |
| /* must do a complete takeover-after-native */ |
| retakeover_after_native(tr, INTERCEPT_EARLY_ASYNCH); |
| } |
| } |
| |
| /* This routine is called by a DynamoRIO routine that was invoked natively, |
| * i.e., not under DynamoRIO control. |
| * This routine takes control using the application state in its arguments, |
| * and starts execution under DynamoRIO at start_pc. |
| * state->callee_arg is a boolean "save_dcontext": |
| * If save_dcontext is true, it saves the cur dcontext on the callback stack |
| * of dcontexts and proceeds to execute with a new dcontext. |
| * Otherwise, it uses the current dcontext, which has its trace squashed. |
| */ |
| static void |
| asynch_take_over(app_state_at_intercept_t *state) |
| { |
| dcontext_t *dcontext; |
| bool save_dcontext = (bool)(ptr_uint_t) state->callee_arg; |
| if (save_dcontext) { |
| /* save cur dcontext and get a new one */ |
| dcontext = callback_setup(state->start_pc); |
| } else { |
| dcontext = get_thread_private_dcontext(); |
| ASSERT(dcontext->initialized); |
| /* case 9347 we want to let go after image entry point */ |
| if (RUNNING_WITHOUT_CODE_CACHE() |
| && dcontext->next_tag == BACK_TO_NATIVE_AFTER_SYSCALL |
| && state->start_pc == image_entry_pc) { |
| ASSERT(dcontext->native_exec_postsyscall == image_entry_pc); |
| } else { |
| ASSERT(!RUNNING_WITHOUT_CODE_CACHE()); |
| dcontext->next_tag = state->start_pc; |
| } |
| /* if we were building a trace, kill it */ |
| if (is_building_trace(dcontext)) { |
| LOG(THREAD, LOG_ASYNCH, 2, "asynch_take_over: squashing old trace\n"); |
| trace_abort(dcontext); |
| } |
| } |
| ASSERT(os_using_app_state(dcontext)); |
| LOG(THREAD, LOG_ASYNCH, 2, "asynch_take_over 0x%08x\n", state->start_pc); |
| /* may have been inside syscall...now we're in app! */ |
| set_at_syscall(dcontext, false); |
| /* tell dispatch() why we're coming there */ |
| if (dcontext->whereami != WHERE_APP) /* new thread, typically: leave it that way */ |
| dcontext->whereami = WHERE_TRAMPOLINE; |
| set_last_exit(dcontext, (linkstub_t *) get_asynch_linkstub()); |
| |
| transfer_to_dispatch(dcontext, &state->mc, false/*!full_DR_state*/); |
| ASSERT_NOT_REACHED(); |
| } |
| |
| bool |
| new_thread_is_waiting_for_dr_init(thread_id_t tid, app_pc pc) |
| { |
| uint i; |
| /* i#1443c#4: check for a thread that's about to hit our hook */ |
| if (pc == LdrInitializeThunk || pc == (app_pc)KiUserApcDispatcher) |
| return true; |
| /* We check until the max to avoid races on threads_waiting_count */ |
| for (i = 0; i < MAX_THREADS_WAITING_FOR_DR_INIT; i++) { |
| if (threads_waiting_for_dr_init[i] == tid) |
| return true; |
| } |
| return false; |
| } |
| |
| static void |
| possible_new_thread_wait_for_dr_init(CONTEXT *cxt) |
| { |
| /* Because of problems with injected threads while we are initializing |
| * (cases 5167, 5020, 5103, and a bunch of others) we block here while the main |
| * thread finishes initializing. Once dynamo_exited is set it is safe to |
| * let the thread continue since dynamo_thread_init will immediately |
| * return. */ |
| uint idx; |
| #ifdef CLIENT_SIDELINE |
| /* We allow a client init routine to create client threads: DR is |
| * initialized enough by now |
| */ |
| if (((void *)cxt->CXT_XIP == (void *)client_thread_target)) |
| return; |
| #endif |
| |
| if (dynamo_initialized || dynamo_exited) |
| return; |
| |
| /* i#1443: communicate with os_take_over_all_unknown_threads() */ |
| idx = atomic_add_exchange_int((volatile int *)&threads_waiting_count, 1); |
| idx--; /* -1 to get index from count */ |
| ASSERT(idx < MAX_THREADS_WAITING_FOR_DR_INIT); |
| if (idx >= MAX_THREADS_WAITING_FOR_DR_INIT) { |
| /* What can we do? We'll have to risk it and hope this thread is scheduled |
| * and initializes before os_take_over_all_unknown_threads() runs. |
| */ |
| } else { |
| threads_waiting_for_dr_init[idx] = get_thread_id(); |
| } |
| |
| while (!dynamo_initialized && !dynamo_exited) { |
| STATS_INC(apc_yields_while_initializing); |
| os_thread_yield(); |
| } |
| |
| if (idx < MAX_THREADS_WAITING_FOR_DR_INIT) { |
| /* os_take_over_all_unknown_threads()'s context check will work from here */ |
| threads_waiting_for_dr_init[idx] = INVALID_THREAD_ID; |
| } |
| } |
| |
| /* returns true if the intercept function should return immediately and let the thread go,
| * false if the intercept function should continue processing and maybe take over */
| static bool |
| intercept_new_thread(CONTEXT *cxt) |
| { |
| #ifdef CLIENT_INTERFACE |
| bool is_client = false; |
| #endif |
| byte *dstack = NULL; |
| priv_mcontext_t mc; |
| /* init apc, check init_apc_go_native to sync w/detach */ |
| if (init_apc_go_native) { |
| /* need to wait after checking _go_native to avoid a thread
| * going native too early because of races between setting
| * _go_native and _pause */
| if (init_apc_go_native_pause) { |
| /* FIXME: this, along with any other logging in this
| * routine, could potentially race with detach
| * cleanup, though that is unlikely */
| LOG(GLOBAL, LOG_ALL, 2, |
| "Thread waiting at init_apc for detach to finish\n"); |
| } |
| while (init_apc_go_native_pause) { |
| os_thread_yield(); |
| } |
| /* just return; FIXME: see concerns in detach_helper about
| * getting to native code before the interception_code is
| * freed and getting out of here before the dll is unloaded
| */
| # if 0 /* this is not a dynamo controlled thread! */ |
| SELF_PROTECT_LOCAL(get_thread_private_dcontext(), READONLY); |
| # endif |
| return true /* exit intercept function and let go */; |
| } |
| |
| /* should keep in sync with changes in intercept_image_entry() for |
| * thread initialization |
| */ |
| |
| /* initialize thread now */ |
| #ifdef CLIENT_SIDELINE |
| /* i#41/PR 222812: client threads target a certain routine, and always
| * directly, never via the win32 API (so we don't check THREAD_START_ADDR)
| */
| is_client = ((void *)cxt->CXT_XIP == (void *)client_thread_target); |
| if (is_client) { |
| /* client threads start out on dstack */ |
| GET_STACK_PTR(dstack); |
| ASSERT(is_dynamo_address(dstack)); |
| /* we assume that less than a page will have been used */ |
| dstack = (byte *) ALIGN_FORWARD(dstack, PAGE_SIZE); |
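| /* Rounding up to a page boundary recovers the top of the dstack (assuming the
| * dstack top is page-aligned); passing it to dynamo_thread_init() below lets DR
| * reuse this stack as the thread's dstack rather than allocating a new one. */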
| } |
| #endif |
| context_to_mcontext(&mc, cxt); |
| if (dynamo_thread_init(dstack, &mc _IF_CLIENT_INTERFACE(is_client)) != -1) { |
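| /* dynamo_thread_init() is assumed to return -1 when this thread was already
| * initialized under DR; any other value means we just initialized it here */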
| app_pc thunk_xip = (app_pc)cxt->CXT_XIP; |
| dcontext_t *dcontext = get_thread_private_dcontext(); |
| LOG_DECLARE(char sym_buf[MAXIMUM_SYMBOL_LENGTH];) |
| bool is_nudge_thread = false; |
| |
| #ifdef CLIENT_SIDELINE |
| if (is_client) { |
| ASSERT(is_on_dstack(dcontext, (byte *)cxt->CXT_XSP)); |
| /* PR 210591: hide our threads from DllMain by not executing the rest
| * of the Ldr init code and going straight to the target. create_thread()
| * already set up the arg in cxt.
| */ |
| nt_continue(cxt); |
| ASSERT_NOT_REACHED(); |
| } |
| #endif |
| |
| /* Xref case 552, to ameliorate the risk of an attacker |
| * leveraging our detach routines etc. against us, we detect |
| * an incoming nudge thread here during thread init and set |
| * a dcontext flag that the nudge routines can later verify. |
| * An attacker could still bypass this if they can control the start addr
| * of a new thread (FIXME). We check both Xax and Xip since
| * nodemgr has the ability to target directly or to send through the
| * kernel32 start thunk (though only the start thunk, i.e. xax,
| * is currently used). If we move to direct targeting only,
| * i.e. xip, it would be a lot harder for the attacker since
| * the documented API routines all hardcode that value.
| * |
| * The nudge related checks below were moved above thread_policy checks |
| * because there is no dependency and because process control nudge for |
| * thin_client needs it; part of cases 8884, 8594 & 8888. */ |
| ASSERT(dcontext != NULL && dcontext->nudge_target == NULL); |
| if ((void *)cxt->CXT_XIP == (void *)generic_nudge_target || |
| (void *)cxt->THREAD_START_ADDR == (void *)generic_nudge_target) { |
| LOG(THREAD, LOG_ALL, 1, "Thread targeting nudge.\n"); |
| if (dcontext != NULL) { |
| dcontext->nudge_target = (void *)generic_nudge_target; |
| } |
| is_nudge_thread = true; |
| } |
| /* FIXME: temporary fix for case 9467 - mute nudges for cygwin apps. |
| * Long term fix is to make nudge threads go directly to their targets. |
| */ |
| if (is_nudge_thread && DYNAMO_OPTION(thin_client) && DYNAMO_OPTION(mute_nudge)) { |
| TRY_EXCEPT(dcontext, { /* to prevent crashes when walking the ldr list */ |
| PEB *peb = get_own_peb(); |
| PEB_LDR_DATA *ldr = peb->LoaderData; |
| LIST_ENTRY *e; |
| LIST_ENTRY *start = &ldr->InLoadOrderModuleList; |
| LDR_MODULE *mod; |
| uint traversed = 0; |
| |
| /* Note: this loader module list walk is racy with the loader; |
| * can't really grab the loader lock here. Shouldn't be a big |
| * problem as this is a temp fix anyway. */ |
| for (e = start->Flink; e != start; e = e->Flink) { |
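| /* the InLoadOrderModuleList links are the first field of LDR_MODULE, so the
| * LIST_ENTRY pointer can be cast directly to the module entry */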
| mod = (LDR_MODULE *) e; |
| if (wcsstr(mod->BaseDllName.Buffer, L"cygwin1.dll") != NULL) { |
| os_terminate(dcontext, TERMINATE_THREAD|TERMINATE_CLEANUP); |
| ASSERT_NOT_REACHED(); |
| } |
| if (traversed++ > MAX_MODULE_LIST_INFINITE_LOOP_THRESHOLD) { |
| SYSLOG_INTERNAL_WARNING("nudge muting: too many modules"); |
| break; |
| } |
| } |
| }, { /* do nothing */ }); |
| } |
| |
| /* For thin_client, let go right after we init the thread, i.e., create
| * the dcontext; don't do the thread policy stuff, which requires locks
| * that aren't initialized in this mode! */
| if (DYNAMO_OPTION(thin_client)) |
| return true /* exit intercept function and let go */; |
| |
| /* In fact the apc_target is ntdll!LdrInitializeThunk |
| * (for all threads not only the first one). |
| * Note for vista that threads do not start with an apc, but rather |
| * directly show up at ntdll!LdrInitializeThunk (which we hook on |
| * vista to call this routine). Note that the thunk will return via |
| * an NtContinue to a context on the stack so really we see the same |
| * behavior as before except we don't go through the apc dispatcher. |
| * |
| * For threads created by kernel32!CreateRemoteThread pre vista |
| * the cxt->Xip then is kernel32!Base{Process,Thread}StartThunk (not exported), |
| * while the cxt->Xax is the user thread procedure and cxt->Xbx is the arg. |
| * On vista it's the same except cxt->Xip is set to ntdll!RtlUserThreadStart |
| * (which is exported in ntdll.dll) by the kernel. |
| * |
| * kernel32!BaseProcessStartThunk and kernel32!BaseThreadStartThunk,
| * on all versions I've tested, start with
| *   0xed33    xor ebp,ebp
| * |
| * Note, of course, that direct NtCreateThread calls |
| * can go anywhere they want (including on Vista). For example, toolhelp
| * uses NTDLL!RtlpQueryProcessDebugInformationRemote
| * as the xip, so we shouldn't count on this much. NtCreateThreadEx threads
| * (vista only) will, however, always have xip=ntdll!RtlUserThreadStart |
| * since the kernel sets that. |
| */ |
| /* keep in mind this is a 16-bit match */ |
| #define BASE_THREAD_START_THUNK_USHORT 0xed33 |
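| /* 0xed33 is the 2-byte little-endian read of the encoding 33 ed, i.e. the
| * "xor ebp,ebp" first instruction noted above */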
| |
| /* see comments in os.c pre_system_call CreateThread, Xax holds |
| * the win32 start address (Nebbett), Xbx holds the argument |
| * (observation). Same appears to hold for CreateThreadEx. */ |
| /* Note that the initial thread won't log here */ |
| LOG(THREAD_GET, LOG_THREADS, 1, |
| "New Thread : Win32 start address "PFX" arg "PFX", thunk xip="PFX"\n", |
| cxt->THREAD_START_ADDR, cxt->THREAD_START_ARG, cxt->CXT_XIP); |
| DOLOG(1, LOG_THREADS, { |
| print_symbolic_address((app_pc)cxt->THREAD_START_ADDR, sym_buf, sizeof(sym_buf), |
| false); |
| LOG(THREAD_GET, LOG_THREADS, 1, |
| "Symbol information for start address %s\n", sym_buf); |
| }); |
| DOLOG(2, LOG_THREADS, { |
| print_symbolic_address((app_pc)cxt->CXT_XIP, sym_buf, sizeof(sym_buf), |
| false); |
| LOG(THREAD_GET, LOG_THREADS, 2, |
| "Symbol information for thunk address %s\n", sym_buf); |
| }); |
| |
| /* start address should be set at thread initialization */ |
| if (dcontext->win32_start_addr == (app_pc)cxt->THREAD_START_ARG) { |
| /* case 10965/PR 215400: WOW64 & x64 query returns arg for some reason */ |
| #ifndef X64 |
| ASSERT(is_wow64_process(NT_CURRENT_PROCESS)); |
| #endif |
| dcontext->win32_start_addr = (app_pc)cxt->THREAD_START_ADDR; |
| } |
| |