| /* ********************************************************** |
| * Copyright (c) 2011-2014 Google, Inc. All rights reserved. |
| * Copyright (c) 2001-2010 VMware, Inc. All rights reserved. |
| * ********************************************************** */ |
| |
| /* |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * * Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * * Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * * Neither the name of VMware, Inc. nor the names of its contributors may be |
| * used to endorse or promote products derived from this software without |
| * specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE |
| * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
| * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH |
| * DAMAGE. |
| */ |
| |
| /* Copyright (c) 2003-2007 Determina Corp. */ |
| /* Copyright (c) 2001-2003 Massachusetts Institute of Technology */ |
| /* Copyright (c) 2001 Hewlett-Packard Company */ |
| |
| /* |
| * x86.asm - x86 specific assembly and trampoline code |
| * |
| * This file is used for both linux and windows. |
| * We used to use the gnu assembler on both platforms, but |
| * gas does not support 64-bit windows. |
| * Thus we now use masm on windows and gas with the new intel-syntax-specifying |
| * options so that our code here only needs a minimum of macros to |
| * work on both. |
| * |
| * Note that for gas on cygwin we used to need to prepend _ to global |
| * symbols: we don't need that for linux gas or masm so we don't do it anymore. |
| */ |
| |
| /* We handle different registers and calling conventions with a CPP pass. |
| * It can be difficult to choose registers that work across all ABIs we're |
| * trying to support: we need to move each ARG into a register in case |
| * it is passed in memory, but we have to pick registers that don't already |
| * hold other arguments. Typically, use this order: |
| * REG_XAX, REG_XBX, REG_XDI, REG_XSI, REG_XDX, REG_XCX |
| * Note that REG_XBX is by convention used on linux for the PIC base: if we |
| * want to avoid relocations (case 7852) we should also avoid using it here |
| * to prevent confusion (though we can always pick a different register, |
| * even varying by function). |
| * FIXME: should we use virtual registers instead? |
| * FIXME: should we have ARG1_IN_REG macro that is either nop or load from stack? |
| * For now not bothering, but if we add more routines we'll want more support. |
| * Naturally the ARG* macros are only valid at function entry. |
| */ |
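| |
| /* For illustration only (a hypothetical two-arg routine following the order |
| * above; it is not part of this file): at function entry one would typically |
| * do |
| * mov REG_XAX, ARG1 |
| * mov REG_XBX, ARG2 |
| * before any further pushes, since the ARG* macros are only valid at entry. |
| */ |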
| |
| #include "../asm_defines.asm" |
| START_FILE |
| |
| #ifdef UNIX |
| # ifdef LINUX |
| # include "include/syscall.h" |
| # else |
| # include "include/syscall_mach.h" |
| # include <sys/syscall.h> |
| # endif |
| #endif |
| |
| #define RESTORE_FROM_DCONTEXT_VIA_REG(reg,offs,dest) mov dest, PTRSZ [offs + reg] |
| #define SAVE_TO_DCONTEXT_VIA_REG(reg,offs,src) mov PTRSZ [offs + reg], src |
| |
| /* For the few remaining dcontext_t offsets we need here: */ |
| #if defined(WINDOWS) && !defined(X64) |
| # define UPCXT_BEFORE_INLINE_SLOTS 4 /* at_syscall + padding */ |
| #else |
| # define UPCXT_BEFORE_INLINE_SLOTS 8 /* IF_UNIX(errno +) at_syscall + padding */ |
| #endif |
| |
| /* Count the slots for client clean call inlining. */ |
| #ifdef CLIENT_INTERFACE |
| /* Add CLEANCALL_NUM_INLINE_SLOTS(5) * ARG_SZ for these slots. No padding. */ |
| # define UPCXT_EXTRA (UPCXT_BEFORE_INLINE_SLOTS + 5 * ARG_SZ) |
| #else |
| # define UPCXT_EXTRA UPCXT_BEFORE_INLINE_SLOTS |
| #endif |
| |
| /* We should give asm_defines.asm all unique names and then include globals.h |
| * and avoid all this duplication! |
| */ |
| #ifdef X64 |
| # ifdef WINDOWS |
| # define NUM_XMM_SLOTS 6 /* xmm0-5 */ |
| # else |
| # define NUM_XMM_SLOTS 16 /* xmm0-15 */ |
| # endif |
| # define PRE_XMM_PADDING 16 |
| #else |
| # define NUM_XMM_SLOTS 8 /* xmm0-7 */ |
| # define PRE_XMM_PADDING 24 |
| #endif |
| #define XMM_SAVED_REG_SIZE 32 /* for ymm */ |
| /* xmm0-5/7/15 for PR 264138/i#139/PR 302107 */ |
| #define XMM_SAVED_SIZE ((NUM_XMM_SLOTS)*(XMM_SAVED_REG_SIZE)) |
| |
| /* Should we generate all of our asm code instead of having it static? |
| * As it is we're duplicating insert_push_all_registers(), dr_insert_call(), etc., |
| * but it's not that much code here in these macros, and this is simpler |
| * than emit_utils.c-style code. |
| */ |
| #ifdef X64 |
| /* Pushes the GPRs in priv_mcontext_t order. Does NOT make the saved xsp a |
| * pre-push value, as no callers need that (they all use PUSH_PRIV_MCXT). |
| * Leaves space for, but does NOT fill in, the xmm0-5 slots (PR 264138), |
| * since it's hard to dynamically figure out during bootstrapping whether |
| * movdqu or movups are legal instructions. The caller is expected |
| * to fill in the xmm values prior to any calls that may clobber them. |
| */ |
| # define PUSHGPR \ |
| push r15 @N@\ |
| push r14 @N@\ |
| push r13 @N@\ |
| push r12 @N@\ |
| push r11 @N@\ |
| push r10 @N@\ |
| push r9 @N@\ |
| push r8 @N@\ |
| push rax @N@\ |
| push rcx @N@\ |
| push rdx @N@\ |
| push rbx @N@\ |
| /* not the pusha pre-push rsp value but see above */ @N@\ |
| push rsp @N@\ |
| push rbp @N@\ |
| push rsi @N@\ |
| push rdi |
| # define POPGPR \ |
| pop rdi @N@\ |
| pop rsi @N@\ |
| pop rbp @N@\ |
| pop rbx /* rsp into dead rbx */ @N@\ |
| pop rbx @N@\ |
| pop rdx @N@\ |
| pop rcx @N@\ |
| pop rax @N@\ |
| pop r8 @N@\ |
| pop r9 @N@\ |
| pop r10 @N@\ |
| pop r11 @N@\ |
| pop r12 @N@\ |
| pop r13 @N@\ |
| pop r14 @N@\ |
| pop r15 @N@ |
| # define PRIV_MCXT_SIZE (18*ARG_SZ + PRE_XMM_PADDING + XMM_SAVED_SIZE) |
| # define dstack_OFFSET (PRIV_MCXT_SIZE+UPCXT_EXTRA+3*ARG_SZ) |
| # define MCONTEXT_PC_OFFS (17*ARG_SZ) |
| #else |
| # define PUSHGPR \ |
| pusha |
| # define POPGPR \ |
| popa |
| # define PRIV_MCXT_SIZE (10*ARG_SZ + PRE_XMM_PADDING + XMM_SAVED_SIZE) |
| # define dstack_OFFSET (PRIV_MCXT_SIZE+UPCXT_EXTRA+3*ARG_SZ) |
| # define MCONTEXT_PC_OFFS (9*ARG_SZ) |
| #endif |
| /* offsetof(dcontext_t, is_exiting) */ |
| #define is_exiting_OFFSET (dstack_OFFSET+1*ARG_SZ) |
| #define PUSHGPR_XSP_OFFS (3*ARG_SZ) |
| #define MCONTEXT_XSP_OFFS (PUSHGPR_XSP_OFFS) |
| #define PUSH_PRIV_MCXT_PRE_PC_SHIFT (- XMM_SAVED_SIZE - PRE_XMM_PADDING) |
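| |
| /* Worked example for illustration only (assuming a 64-bit linux build, where |
| * ARG_SZ == 8 and NUM_XMM_SLOTS == 16; other platforms differ): |
| * XMM_SAVED_SIZE = 16*32 = 512 |
| * PRIV_MCXT_SIZE = 18*8 + 16 + 512 = 672 (GPRs + flags + pc, padding, xmm) |
| * dstack_OFFSET = 672 + UPCXT_EXTRA + 3*8 |
| * These sample values are a descriptive aid only; the macros above are what |
| * the code actually uses. |
| */ |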
| |
| /* Pushes a priv_mcontext_t on the stack, with an xsp value equal to the |
| * xsp before the pushing. Clobbers xax! |
| * Does fill in xmm0-5, if necessary, for PR 264138. |
| * Assumes that DR has been initialized (get_xmm_vals() checks proc feature bits). |
| * Caller should ensure 16-byte stack alignment prior to the push (PR 306421). |
| */ |
| #define PUSH_PRIV_MCXT(pc) \ |
| lea REG_XSP, [REG_XSP + PUSH_PRIV_MCXT_PRE_PC_SHIFT] @N@\ |
| push pc @N@\ |
| PUSHF @N@\ |
| PUSHGPR @N@\ |
| lea REG_XAX, [REG_XSP] @N@\ |
| CALLC1(GLOBAL_REF(get_xmm_vals), REG_XAX) @N@\ |
| lea REG_XAX, [PRIV_MCXT_SIZE + REG_XSP] @N@\ |
| mov [PUSHGPR_XSP_OFFS + REG_XSP], REG_XAX |
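| |
| /* Rough sketch of the frame left by PUSH_PRIV_MCXT, as a descriptive aid only |
| * (offsets are from the final xsp and match priv_mcontext_t): |
| * xsp + 0 .. : GPRs (xdi at the top; the slot at MCONTEXT_XSP_OFFS is |
| * overwritten with the pre-push xsp by the final mov above) |
| * xsp + MCONTEXT_PC_OFFS - ARG_SZ : eflags |
| * xsp + MCONTEXT_PC_OFFS : the pc argument |
| * above that : PRE_XMM_PADDING plus the xmm/ymm slots, filled in by |
| * get_xmm_vals() |
| */ |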
| |
| /* Pops the GPRs and flags from a priv_mcontext off the stack. Does not |
| * restore xmm/ymm regs. |
| */ |
| #define POP_PRIV_MCXT_GPRS() \ |
| POPGPR @N@\ |
| POPF @N@\ |
| lea REG_XSP, [REG_XSP - PUSH_PRIV_MCXT_PRE_PC_SHIFT + ARG_SZ/*pc*/] |
| |
| /* This is really the alignment needed by x64 code. For now, when we bother to |
| * align the stack pointer, we just go for 16 byte alignment. We do *not* |
| * assume 16-byte alignment across the code base. |
| * i#847: Investigate using aligned SSE ops (see get_xmm_caller_saved). |
| */ |
| #define FRAME_ALIGNMENT 16 |
| |
| /****************************************************************************/ |
| /****************************************************************************/ |
| |
| DECL_EXTERN(unexpected_return) |
| |
| #ifndef NOT_DYNAMORIO_CORE_PROPER |
| |
| DECL_EXTERN(get_own_context_integer_control) |
| DECL_EXTERN(get_xmm_vals) |
| DECL_EXTERN(auto_setup) |
| DECL_EXTERN(return_from_native) |
| DECL_EXTERN(native_module_callout) |
| DECL_EXTERN(dispatch) |
| #ifdef DR_APP_EXPORTS |
| DECL_EXTERN(dr_app_start_helper) |
| #endif |
| DECL_EXTERN(dynamo_process_exit) |
| DECL_EXTERN(dynamo_thread_exit) |
| DECL_EXTERN(dynamo_thread_stack_free_and_exit) |
| DECL_EXTERN(dynamorio_app_take_over_helper) |
| DECL_EXTERN(found_modified_code) |
| DECL_EXTERN(get_cleanup_and_terminate_global_do_syscall_entry) |
| #ifdef INTERNAL |
| DECL_EXTERN(internal_error) |
| #endif |
| DECL_EXTERN(internal_exception_info) |
| DECL_EXTERN(is_currently_on_dstack) |
| DECL_EXTERN(nt_continue_setup) |
| #if defined(UNIX) |
| DECL_EXTERN(master_signal_handler_C) |
| #endif |
| #ifdef MACOS |
| DECL_EXTERN(new_bsdthread_setup) |
| #endif |
| DECL_EXTERN(hashlookup_null_target) |
| #if defined(UNIX) && !defined(HAVE_SIGALTSTACK) |
| DECL_EXTERN(sig_should_swap_stack) |
| DECL_EXTERN(fixup_rtframe_pointers) |
| # define CLONE_AND_SWAP_STRUCT_SIZE 2*ARG_SZ |
| #endif |
| #ifdef UNIX |
| DECL_EXTERN(dr_setjmp_sigmask) |
| DECL_EXTERN(privload_early_inject) |
| DECL_EXTERN(dynamorio_dl_fixup) |
| #endif |
| #ifdef WINDOWS |
| DECL_EXTERN(dynamorio_earliest_init_takeover_C) |
| DECL_EXTERN(os_terminate_wow64_stack) |
| #endif |
| |
| /* non-functions: these make us non-PIC! (PR 212290) */ |
| DECL_EXTERN(exiting_thread_count) |
| DECL_EXTERN(initstack) |
| DECL_EXTERN(initstack_mutex) |
| DECL_EXTERN(int_syscall_address) |
| DECL_EXTERN(syscalls) |
| DECL_EXTERN(sysenter_ret_address) |
| DECL_EXTERN(sysenter_tls_offset) |
| #ifdef WINDOWS |
| DECL_EXTERN(wow64_index) |
| # ifdef X64 |
| DECL_EXTERN(syscall_argsz) |
| # endif |
| #endif |
| |
| |
| #ifdef WINDOWS |
| /* dynamo_auto_start: used for non-early follow children. |
| * Assumptions: The saved priv_mcontext_t for the start of the app is on |
| * the stack, followed by a pointer to a region of memory to free |
| * (which can be NULL) and its size. This routine is reached by a jmp |
| * so be aware of that for address calculation. This routine does |
| * not return. |
| * |
| * On win32, note that in order to export this from the dynamo dll, which is |
| * required for non early follow children, we have to explicitly tell the |
| * linker to do so. This is done in the Makefile. |
| * Note that if it weren't for wanting local go-native code, auto_setup in |
| * x86_code.c could itself be dynamo_auto_start. |
| */ |
| DECLARE_FUNC(dynamo_auto_start) |
| GLOBAL_LABEL(dynamo_auto_start:) |
| /* we pass a pointer to TOS as a parameter. |
| * a param in xsp won't work w/ win64 padding so put in xax */ |
| mov REG_XAX, REG_XSP |
| CALLC1(GLOBAL_REF(auto_setup), REG_XAX) |
| /* if auto_setup returns, we need to go native */ |
| jmp load_dynamo_failure |
| END_FUNC(dynamo_auto_start) |
| #endif |
| |
| #ifdef UNIX |
| /* We avoid performance problems with messing up the RSB by using |
| * a separate routine. The caller needs to use a plain call |
| * with _GLOBAL_OFFSET_TABLE_ on the exact return address instruction. |
| */ |
| DECLARE_FUNC(get_pic_xdi) |
| GLOBAL_LABEL(get_pic_xdi:) |
| mov REG_XDI, PTRSZ [REG_XSP] |
| ret |
| END_FUNC(get_pic_xdi) |
| #endif |
| |
| /* void call_switch_stack(dcontext_t *dcontext, // 1*ARG_SZ+XAX |
| * byte *stack, // 2*ARG_SZ+XAX |
| * void (*func)(dcontext_t *), // 3*ARG_SZ+XAX |
| * void *mutex_to_free, // 4*ARG_SZ+XAX |
| * bool return_on_return) // 5*ARG_SZ+XAX |
| */ |
| DECLARE_FUNC(call_switch_stack) |
| GLOBAL_LABEL(call_switch_stack:) |
| /* get all args with the same offset (xax) regardless of platform */ |
| #ifdef X64 |
| # ifdef WINDOWS |
| mov REG_XAX, REG_XSP |
| /* stack alignment doesn't really matter (b/c we're swapping) but in case |
| * we add a call we keep this here |
| */ |
| lea REG_XSP, [-ARG_SZ + REG_XSP] /* maintain align-16: offset retaddr */ |
| # else |
| /* no padding so we make our own space. odd #slots keeps align-16 w/ retaddr */ |
| lea REG_XSP, [-5*ARG_SZ + REG_XSP] |
| /* xax points one beyond TOS to get same offset as having retaddr there */ |
| lea REG_XAX, [-ARG_SZ + REG_XSP] |
| mov [5*ARG_SZ + REG_XAX], ARG5 |
| # endif |
| mov [1*ARG_SZ + REG_XAX], ARG1 |
| mov [2*ARG_SZ + REG_XAX], ARG2 |
| mov [3*ARG_SZ + REG_XAX], ARG3 |
| mov [4*ARG_SZ + REG_XAX], ARG4 |
| #else |
| /* stack alignment doesn't matter */ |
| mov REG_XAX, REG_XSP |
| #endif |
| /* we need a callee-saved reg across our call so save it onto stack */ |
| push REG_XBX |
| mov REG_XBX, REG_XAX |
| /* alignment doesn't matter: swapping stacks */ |
| push IF_X64_ELSE(r12, REG_XDI) /* xdi is used for func param in X64 */ |
| mov IF_X64_ELSE(r12, REG_XDI), REG_XSP |
| /* set up for call */ |
| mov REG_XDX, [3*ARG_SZ + REG_XAX] /* func */ |
| mov REG_XCX, [1*ARG_SZ + REG_XAX] /* dcontext */ |
| mov REG_XSP, [2*ARG_SZ + REG_XAX] /* stack */ |
| cmp PTRSZ [4*ARG_SZ + REG_XAX], 0 /* mutex_to_free */ |
| je call_dispatch_alt_stack_no_free |
| mov REG_XAX, [4*ARG_SZ + REG_XAX] |
| mov DWORD [REG_XAX], 0 |
| call_dispatch_alt_stack_no_free: |
| CALLC1(REG_XDX, REG_XCX) |
| mov REG_XSP, IF_X64_ELSE(r12, REG_XDI) |
| mov REG_XAX, REG_XBX |
| cmp BYTE [5*ARG_SZ + REG_XAX], 0 /* return_on_return */ |
| je GLOBAL_REF(unexpected_return) |
| pop IF_X64_ELSE(r12, REG_XDI) |
| pop REG_XBX |
| #ifdef X64 |
| # ifdef WINDOWS |
| mov REG_XSP, REG_XAX |
| # else |
| lea REG_XSP, [5*ARG_SZ + REG_XSP] |
| # endif |
| #else |
| mov REG_XSP, REG_XAX |
| #endif |
| ret |
| END_FUNC(call_switch_stack) |
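| |
| /* Illustrative C-level usage only (a hypothetical caller; the authoritative |
| * prototype lives in the C headers): |
| * call_switch_stack(dcontext, dcontext->dstack, func_to_run, |
| * mutex_to_free_or_NULL, true); |
| * Passing false for return_on_return means control is not expected to come |
| * back, and a return from func lands at unexpected_return instead. |
| */ |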
| |
| #ifdef CLIENT_INTERFACE |
| /* |
| * Calls the specified function 'func' after switching to the DR stack |
| * for the thread corresponding to 'drcontext'. |
| * Passes in 8 arguments. Uses the C calling convention, so 'func' will work |
| * just fine even if it takes fewer than 8 args. |
| * Swaps the stack back upon return and returns the value returned by 'func'. |
| * |
| * void * dr_call_on_clean_stack(void *drcontext, // 1*ARG_SZ+XAX |
| * void *(*func)(arg1...arg8), // 2*ARG_SZ+XAX |
| * void *arg1, // 3*ARG_SZ+XAX |
| * void *arg2, // 4*ARG_SZ+XAX |
| * void *arg3, // 5*ARG_SZ+XAX |
| * void *arg4, // 6*ARG_SZ+XAX |
| * void *arg5, // 7*ARG_SZ+XAX |
| * void *arg6, // 8*ARG_SZ+XAX |
| * void *arg7, // 9*ARG_SZ+XAX |
| * void *arg8) //10*ARG_SZ+XAX |
| */ |
| DECLARE_EXPORTED_FUNC(dr_call_on_clean_stack) |
| GLOBAL_LABEL(dr_call_on_clean_stack:) |
| /* avoid colliding with ARG* in either scratch reg */ |
| # ifdef X64 |
| # define SCRATCH1 r10 |
| # define SCRATCH2 r11 |
| # else |
| # define SCRATCH1 edx |
| # define SCRATCH2 ecx |
| # endif |
| /* get all args with the same offset (xax) regardless of platform */ |
| # ifdef X64 |
| # ifdef WINDOWS |
| mov REG_XAX, REG_XSP |
| /* stack alignment doesn't really matter (b/c we're swapping) but in case |
| * we add a call we keep this here |
| */ |
| lea REG_XSP, [-ARG_SZ + REG_XSP] /* maintain align-16: offset retaddr */ |
| # else |
| /* no padding so we make our own space. odd #slots keeps align-16 w/ retaddr */ |
| lea REG_XSP, [-5*ARG_SZ + REG_XSP] |
| /* xax points one beyond TOS to get same offset as having retaddr there */ |
| lea REG_XAX, [-ARG_SZ + REG_XSP] |
| /* save the retaddr */ |
| mov SCRATCH1, [6*ARG_SZ + REG_XAX] |
| mov [5*ARG_SZ + REG_XAX], ARG5 |
| mov [6*ARG_SZ + REG_XAX], ARG6 |
| # endif |
| mov [1*ARG_SZ + REG_XAX], ARG1 |
| mov [2*ARG_SZ + REG_XAX], ARG2 |
| mov [3*ARG_SZ + REG_XAX], ARG3 |
| mov [4*ARG_SZ + REG_XAX], ARG4 |
| # else |
| /* stack alignment doesn't matter */ |
| mov REG_XAX, REG_XSP |
| # endif |
| # if defined(X64) && !defined(WINDOWS) |
| push SCRATCH1 /* retaddr */ |
| # endif |
| /* we need a callee-saved reg across our call so save it onto stack */ |
| push REG_XBX |
| push REG_XBP /* alignment doesn't matter: swapping stacks */ |
| mov REG_XBX, REG_XAX |
| mov REG_XBP, REG_XSP |
| /* set up for call */ |
| mov SCRATCH1, [2*ARG_SZ + REG_XAX] /* func */ |
| mov SCRATCH2, [1*ARG_SZ + REG_XAX] /* drcontext */ |
| RESTORE_FROM_DCONTEXT_VIA_REG(SCRATCH2, dstack_OFFSET, REG_XSP) |
| STACK_PAD_NOPUSH(8, 4, 0) |
| mov SCRATCH2, [10*ARG_SZ + REG_XAX] |
| mov ARG8_NORETADDR, SCRATCH2 |
| mov SCRATCH2, [9*ARG_SZ + REG_XAX] |
| mov ARG7_NORETADDR, SCRATCH2 |
| mov SCRATCH2, [8*ARG_SZ + REG_XAX] |
| mov ARG6_NORETADDR, SCRATCH2 |
| mov SCRATCH2, [7*ARG_SZ + REG_XAX] |
| mov ARG5_NORETADDR, SCRATCH2 |
| mov SCRATCH2, [6*ARG_SZ + REG_XAX] |
| mov ARG4_NORETADDR, SCRATCH2 |
| mov SCRATCH2, [5*ARG_SZ + REG_XAX] |
| mov ARG3_NORETADDR, SCRATCH2 |
| mov SCRATCH2, [4*ARG_SZ + REG_XAX] |
| mov ARG2_NORETADDR, SCRATCH2 |
| mov SCRATCH2, [3*ARG_SZ + REG_XAX] |
| mov ARG1_NORETADDR, SCRATCH2 |
| call SCRATCH1 |
| /* preserve return value in xax */ |
| STACK_UNPAD(8, 4, 0) |
| mov REG_XSP, REG_XBP |
| mov REG_XCX, REG_XBX |
| pop REG_XBP |
| pop REG_XBX |
| # ifdef X64 |
| # ifdef WINDOWS |
| mov REG_XSP, REG_XCX |
| # else |
| pop SCRATCH1 /* retaddr */ |
| lea REG_XSP, [5*ARG_SZ + REG_XSP] |
| mov PTRSZ [REG_XSP], SCRATCH1 /* retaddr */ |
| # endif |
| # else |
| mov REG_XSP, REG_XCX |
| # endif |
| ret |
| END_FUNC(dr_call_on_clean_stack) |
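| |
| /* Illustrative C-level usage only (hypothetical client code; see the exported |
| * declaration in the public headers for the authoritative signature): |
| * void *res = dr_call_on_clean_stack(drcontext, my_func, arg1, arg2, |
| * NULL, NULL, NULL, NULL, NULL, NULL); |
| * All 8 argument slots are always passed; since func uses the C calling |
| * convention, trailing NULLs are harmless for a func taking fewer args. |
| */ |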
| #endif /* CLIENT_INTERFACE */ |
| |
| /* |
| * Copies the region from the current xsp up to tos onto the new stack |
| * (ending at its base) and then swaps to the cloned top of stack. |
| * |
| * void clone_and_swap_stack(byte *stack, byte *tos) |
| */ |
| DECLARE_FUNC(clone_and_swap_stack) |
| GLOBAL_LABEL(clone_and_swap_stack:) |
| mov REG_XAX, ARG1 |
| mov REG_XCX, ARG2 |
| mov REG_XDX, REG_XSP |
| /* save not-always-caller-saved regs */ |
| push REG_XSI |
| push REG_XDI |
| /* memcpy(stack - sz, cur_esp, sz) */ |
| sub REG_XCX, REG_XDX /* sz = tos - cur_esp */ |
| mov REG_XSI, REG_XDX /* source = tos */ |
| mov REG_XDI, REG_XAX /* dest = stack - sz */ |
| sub REG_XDI, REG_XCX |
| sub REG_XAX, REG_XCX /* before lose sz, calculate tos on stack */ |
| /* cld from signal handler for app signal should be ok */ |
| cld |
| rep movsb |
| /* restore and swap to cloned stack */ |
| pop REG_XDI |
| pop REG_XSI |
| mov REG_XSP, REG_XAX |
| ret |
| END_FUNC(clone_and_swap_stack) |
| |
| /* |
| * dr_app_start - Causes application to run under Dynamo control |
| */ |
| #ifdef DR_APP_EXPORTS |
| DECLARE_EXPORTED_FUNC(dr_app_start) |
| GLOBAL_LABEL(dr_app_start:) |
| sub REG_XSP, FRAME_ALIGNMENT - ARG_SZ /* Maintain alignment. */ |
| |
| /* grab exec state and pass as param in a priv_mcontext_t struct */ |
| PUSH_PRIV_MCXT(PTRSZ [FRAME_ALIGNMENT - ARG_SZ + REG_XSP -\ |
| PUSH_PRIV_MCXT_PRE_PC_SHIFT]) /* return address as pc */ |
| |
| /* do the rest in C */ |
| lea REG_XAX, [REG_XSP] /* stack grew down, so priv_mcontext_t at tos */ |
| CALLC1(GLOBAL_REF(dr_app_start_helper), REG_XAX) |
| |
| /* if we come back, then DR is not taking control so |
| * clean up stack and return */ |
| add REG_XSP, PRIV_MCXT_SIZE + FRAME_ALIGNMENT - ARG_SZ |
| ret |
| END_FUNC(dr_app_start) |
| |
| /* |
| * dr_app_take_over - For the client interface, we'll export 'dr_app_take_over' |
| * for consistency with the dr_ naming convention of all exported functions. |
| * We'll keep 'dynamorio_app_take_over' for compatibility with the preinjector. |
| */ |
| DECLARE_EXPORTED_FUNC(dr_app_take_over) |
| GLOBAL_LABEL(dr_app_take_over: ) |
| jmp GLOBAL_REF(dynamorio_app_take_over) |
| END_FUNC(dr_app_take_over) |
| |
| /* dr_app_running_under_dynamorio - Indicates whether the current thread |
| * is running within the DynamoRIO code cache. |
| * Returns false (not under dynamorio) by default. |
| * The function is mangled by dynamorio to return true instead when |
| * it is brought into the code cache. |
| */ |
| DECLARE_EXPORTED_FUNC(dr_app_running_under_dynamorio) |
| GLOBAL_LABEL(dr_app_running_under_dynamorio: ) |
| mov eax, 0 |
| ret |
| END_FUNC(dr_app_running_under_dynamorio) |
| #endif |
| |
| /* |
| * dynamorio_app_take_over - Causes application to run under Dynamo |
| * control. Dynamo never releases control. |
| */ |
| DECLARE_EXPORTED_FUNC(dynamorio_app_take_over) |
| GLOBAL_LABEL(dynamorio_app_take_over:) |
| sub REG_XSP, FRAME_ALIGNMENT - ARG_SZ /* Maintain alignment. */ |
| |
| /* grab exec state and pass as param in a priv_mcontext_t struct */ |
| PUSH_PRIV_MCXT(PTRSZ [FRAME_ALIGNMENT - ARG_SZ + REG_XSP -\ |
| PUSH_PRIV_MCXT_PRE_PC_SHIFT]) /* return address as pc */ |
| |
| /* do the rest in C */ |
| lea REG_XAX, [REG_XSP] /* stack grew down, so priv_mcontext_t at tos */ |
| CALLC1(GLOBAL_REF(dynamorio_app_take_over_helper), REG_XAX) |
| |
| /* if we come back, then DR is not taking control so |
| * clean up stack and return */ |
| add REG_XSP, PRIV_MCXT_SIZE + FRAME_ALIGNMENT - ARG_SZ |
| ret |
| END_FUNC(dynamorio_app_take_over) |
| |
| /* |
| * cleanup_and_terminate(dcontext_t *dcontext, // 1*ARG_SZ+XBP |
| * int sysnum, // 2*ARG_SZ+XBP = syscall # |
| * int sys_arg1/param_base, // 3*ARG_SZ+XBP = arg1 for syscall |
| * int sys_arg2, // 4*ARG_SZ+XBP = arg2 for syscall |
| * bool exitproc, // 5*ARG_SZ+XBP |
| * (these 2 args are only used for Mac thread exit:) |
| * int sys_arg3, // 6*ARG_SZ+XBP = arg3 for syscall |
| * int sys_arg4) // 7*ARG_SZ+XBP = arg4 for syscall |
| * |
| * Calls dynamo_exit_process if exitproc is true, else calls dynamo_exit_thread. |
| * Uses the current dstack, but instructs the cleanup routines not to |
| * de-allocate it, does a custom de-allocate after swapping to initstack (don't |
| * want to use initstack the whole time, that's too long to hold the mutex). |
| * Then calls system call sysnum with parameter base param_base, which is presumed |
| * to be either NtTerminateThread or NtTerminateProcess or exit. |
| * For x64 Windows, args are in ecx and edx (terminate syscalls have only 2 args). |
| * For x64 Linux, 1st 2 args are in rdi and rsi. |
| * |
| * Note that the caller is responsible for placing the actual syscall arguments |
| * at the correct offset from edx (or ebx). See SYSCALL_PARAM_OFFSET in |
| * win32 os.c for more info. |
| * |
| * Note that this routine does not return and thus clobbers callee-saved regs. |
| */ |
| DECLARE_FUNC(cleanup_and_terminate) |
| GLOBAL_LABEL(cleanup_and_terminate:) |
| /* get all args with the same offset (xbp) regardless of platform, to save |
| * across our calls. |
| */ |
| #ifdef X64 |
| # ifdef WINDOWS |
| mov REG_XBP, REG_XSP |
| lea REG_XSP, [-ARG_SZ + REG_XSP] /* maintain align-16: offset retaddr */ |
| # else |
| /* no padding so we make our own space. odd #slots keeps align-16 w/ retaddr */ |
| lea REG_XSP, [-5*ARG_SZ + REG_XSP] |
| /* xbp points one beyond TOS to get same offset as having retaddr there */ |
| lea REG_XBP, [-ARG_SZ + REG_XSP] |
| mov [5*ARG_SZ + REG_XBP], ARG5 |
| mov [6*ARG_SZ + REG_XBP], ARG6 |
| mov REG_XAX, ARG7 |
| mov [7*ARG_SZ + REG_XBP], REG_XAX |
| # endif |
| mov [1*ARG_SZ + REG_XBP], ARG1 |
| mov [2*ARG_SZ + REG_XBP], ARG2 |
| mov [3*ARG_SZ + REG_XBP], ARG3 |
| mov [4*ARG_SZ + REG_XBP], ARG4 |
| #else |
| mov REG_XBP, REG_XSP |
| # if defined(MACOS) && !defined(X64) |
| lea REG_XSP, [-3*ARG_SZ + REG_XSP] /* maintain align-16: offset retaddr */ |
| # endif |
| #endif |
| /* increment exiting_thread_count so that we don't get killed after |
| * thread_exit removes us from the all_threads list */ |
| #if !defined(X64) && defined(LINUX) |
| /* PR 212290: avoid text relocations: get PIC base into callee-saved xdi. |
| * Can't use CALLC0 since it inserts a nop: we need the exact retaddr. |
| */ |
| call get_pic_xdi |
| lea REG_XDI, [_GLOBAL_OFFSET_TABLE_ + REG_XDI] |
| lea REG_XAX, VAR_VIA_GOT(REG_XDI, GLOBAL_REF(exiting_thread_count)) |
| lock inc DWORD [REG_XAX] |
| #else |
| lock inc DWORD SYMREF(exiting_thread_count) /* rip-rel for x64 */ |
| #endif |
| /* save dcontext->dstack for freeing later and set dcontext->is_exiting */ |
| /* xbx is callee-saved and not an x64 param so we can use it */ |
| mov REG_XBX, PTRSZ [1*ARG_SZ + REG_XBP] /* dcontext */ |
| SAVE_TO_DCONTEXT_VIA_REG(REG_XBX,is_exiting_OFFSET,1) |
| CALLC1(GLOBAL_REF(is_currently_on_dstack), REG_XBX) /* xbx is callee-saved */ |
| cmp REG_XAX, 0 |
| jnz cat_save_dstack |
| mov REG_XBX, 0 /* save 0 for dstack to avoid double-free */ |
| jmp cat_done_saving_dstack |
| cat_save_dstack: |
| RESTORE_FROM_DCONTEXT_VIA_REG(REG_XBX,dstack_OFFSET,REG_XBX) |
| cat_done_saving_dstack: |
| /* PR 306421: xbx is callee-saved for all platforms, so don't push yet, |
| * to maintain 16-byte stack alignment |
| */ |
| /* avoid sygate sysenter version as our stack may be static const at |
| * that point, caller will take care of sygate hack */ |
| CALLC0(GLOBAL_REF(get_cleanup_and_terminate_global_do_syscall_entry)) |
| #if defined(MACOS) && !defined(X64) |
| lea REG_XSP, [-2*ARG_SZ + REG_XSP] /* maintain align-16 w/ 2 pushes below */ |
| #endif |
| push REG_XBX /* 16-byte aligned again */ |
| push REG_XAX |
| /* upper bytes are 0xab so only look at lower bytes */ |
| movzx esi, BYTE [5*ARG_SZ + REG_XBP] /* exitproc */ |
| cmp esi, 0 |
| jz cat_thread_only |
| CALLC0(GLOBAL_REF(dynamo_process_exit)) |
| jmp cat_no_thread |
| cat_thread_only: |
| CALLC0(GLOBAL_REF(dynamo_thread_exit)) |
| cat_no_thread: |
| /* now switch to initstack for cleanup of dstack |
| * could use initstack for whole thing but that's too long |
| * of a time to hold global initstack_mutex */ |
| mov ecx, 1 |
| #if !defined(X64) && defined(LINUX) |
| /* PIC base is still in xdi */ |
| lea REG_XAX, VAR_VIA_GOT(REG_XDI, GLOBAL_REF(initstack_mutex)) |
| #endif |
| cat_spin: |
| #if !defined(X64) && defined(LINUX) |
| xchg DWORD [REG_XAX], ecx |
| #else |
| xchg DWORD SYMREF(initstack_mutex), ecx /* rip-relative on x64 */ |
| #endif |
| jecxz cat_have_lock |
| /* try again -- too few free regs to call sleep() */ |
| pause /* good thing gas now knows about pause */ |
| jmp cat_spin |
| cat_have_lock: |
| /* need to grab everything off dstack first */ |
| #ifdef WINDOWS |
| /* PR 601533: the wow64 syscall writes to the stack b/c it |
| * makes a call, so we have a race that can lead to a hang or |
| * worse. we do not expect the syscall to return, so we can |
| * use a global single-entry stack (the wow64 layer swaps to a |
| * different stack: presumably for alignment and other reasons). |
| */ |
| CALLC1(GLOBAL_REF(os_terminate_wow64_stack), -1/*INVALID_HANDLE_VALUE*/) |
| mov REG_XDI, REG_XAX /* esp to use */ |
| #endif |
| mov REG_XSI, [2*ARG_SZ + REG_XBP] /* sysnum */ |
| pop REG_XAX /* syscall */ |
| pop REG_XCX /* dstack */ |
| #if defined(MACOS) && !defined(X64) |
| lea REG_XSP, [2*ARG_SZ + REG_XSP] /* undo align-16 lea from above */ |
| #endif |
| mov REG_XBX, REG_XBP /* save for arg access after swapping stacks */ |
| /* swap stacks */ |
| #if !defined(X64) && defined(LINUX) |
| /* PIC base is still in xdi */ |
| lea REG_XBP, VAR_VIA_GOT(REG_XDI, GLOBAL_REF(initstack)) |
| mov REG_XSP, PTRSZ [REG_XBP] |
| #else |
| mov REG_XSP, PTRSZ SYMREF(initstack) /* rip-relative on x64 */ |
| #endif |
| /* now save registers */ |
| #if defined(MACOS) && !defined(X64) |
| cmp BYTE [5*ARG_SZ + REG_XBX], 0 /* exitproc */ |
| jz cat_thread_only2 |
| /* ensure aligned after 1st 2 arg pushes below, which are the syscall args */ |
| lea REG_XSP, [-2*ARG_SZ + REG_XSP] |
| jmp cat_no_thread2 |
| cat_thread_only2: /* for thread, the 4 pushes make it aligned */ |
| push PTRSZ [7*ARG_SZ + REG_XBX] /* sys_arg4 */ |
| push PTRSZ [6*ARG_SZ + REG_XBX] /* sys_arg3 */ |
| cat_no_thread2: |
| #endif |
| #ifdef WINDOWS |
| push REG_XDI /* esp to use */ |
| #endif |
| push PTRSZ [4*ARG_SZ + REG_XBX] /* sys_arg2 */ |
| push PTRSZ [3*ARG_SZ + REG_XBX] /* sys_arg1 */ |
| push REG_XAX /* syscall */ |
| push REG_XSI /* sysnum => xsp 16-byte aligned for x64 and x86 */ |
| #if defined(MACOS) && !defined(X64) |
| lea REG_XSP, [-2*ARG_SZ + REG_XSP] /* align to 16 for this call */ |
| #endif |
| /* free dstack and call the EXIT_DR_HOOK */ |
| CALLC1(GLOBAL_REF(dynamo_thread_stack_free_and_exit), REG_XCX) /* pass dstack */ |
| #if defined(MACOS) && !defined(X64) |
| lea REG_XSP, [2*ARG_SZ + REG_XSP] /* undo align to 16 */ |
| #endif |
| /* finally, execute the termination syscall */ |
| pop REG_XAX /* sysnum */ |
| #ifdef X64 |
| /* We assume we're doing "syscall" on Windows & Linux, where r10 is dead */ |
| pop r10 /* syscall, in reg dead at syscall */ |
| # ifdef UNIX |
| pop REG_XDI /* sys_arg1 */ |
| pop REG_XSI /* sys_arg2 */ |
| # else |
| pop REG_XCX /* sys_arg1 */ |
| pop REG_XDX /* sys_arg2 */ |
| # endif |
| #else |
| pop REG_XSI /* syscall */ |
| # ifdef MACOS |
| /* Leave the args on the stack for 32-bit Mac. We actually need another |
| * slot before the 1st arg (usually the retaddr for app syscall). |
| * This ends up with stack alignment of 0xc, which is what we want. |
| */ |
| push 0 |
| # elif defined(LINUX) |
| pop REG_XBX /* sys_arg1 */ |
| pop REG_XCX /* sys_arg2 */ |
| # else |
| pop REG_XDX /* sys_arg1 == param_base */ |
| pop REG_XCX /* sys_arg2 (unused) */ |
| # endif |
| #endif |
| #ifdef WINDOWS |
| pop REG_XSP /* get the stack pointer we pushed earlier */ |
| #endif |
| /* give up initstack mutex -- potential problem here with a thread getting |
| * an asynch event that then uses initstack, but syscall should only care |
| * about ebx and edx */ |
| #if !defined(X64) && defined(LINUX) |
| /* PIC base is still in xdi */ |
| lea REG_XBP, VAR_VIA_GOT(REG_XDI, initstack_mutex) |
| mov DWORD [REG_XBP], 0 |
| #else |
| mov DWORD SYMREF(initstack_mutex), 0 /* rip-relative on x64 */ |
| #endif |
| /* we are finished with all shared resources, decrement the |
| * exiting_thread_count (allows another thread to kill us) */ |
| #if !defined(X64) && defined(LINUX) |
| /* PIC base is still in xdi */ |
| lea REG_XBP, VAR_VIA_GOT(REG_XDI, GLOBAL_REF(exiting_thread_count)) |
| lock dec DWORD [REG_XBP] |
| #else |
| lock dec DWORD SYMREF(exiting_thread_count) /* rip-rel on x64 */ |
| #endif |
| #ifdef X64 |
| jmp r10 /* go do the syscall! */ |
| #else |
| jmp REG_XSI /* go do the syscall! */ |
| #endif |
| END_FUNC(cleanup_and_terminate) |
| |
| /* global_do_syscall_int |
| * Caller is responsible for all set up. For windows this means putting the |
| * syscall num in eax and putting the args at edx. For linux this means putting |
| * the syscall num in eax, and the args in ebx, ecx, edx, esi, edi and ebp (in |
| * that order, as needed). global_do_syscall is only used with system calls |
| * that we don't expect to return, so for debug builds we go into an infinite |
| * loop if syscall returns. |
| */ |
| DECLARE_FUNC(global_do_syscall_int) |
| GLOBAL_LABEL(global_do_syscall_int:) |
| #ifdef WINDOWS |
| int HEX(2e) |
| #else |
| /* XXX: if we need to make any Mach syscalls for MacOS here, we'll |
| * need a sysenter version, as the kernel throws SIGSYS when using int. |
| */ |
| int HEX(80) |
| #endif |
| #ifdef DEBUG |
| jmp GLOBAL_REF(debug_infinite_loop) |
| #endif |
| #ifdef UNIX |
| /* we do come here for SYS_kill which can fail: try again via exit_group */ |
| jmp GLOBAL_REF(dynamorio_sys_exit_group) |
| #endif |
| END_FUNC(global_do_syscall_int) |
| |
| /* For sygate hack need to indirect the system call through ntdll. */ |
| #ifdef WINDOWS |
| DECLARE_FUNC(global_do_syscall_sygate_int) |
| GLOBAL_LABEL(global_do_syscall_sygate_int:) |
| /* would be nicer to call so we could return to debug_infinite_loop on |
| * failure, but on some paths (cleanup_and_terminate) we can no longer |
| * safely use the stack */ |
| jmp PTRSZ SYMREF(int_syscall_address) |
| END_FUNC(global_do_syscall_sygate_int) |
| #endif |
| |
| /* global_do_syscall_sysenter |
| * Caller is responsible for all setup: putting the syscall num in eax and |
| * the args at edx+8 (windows specific; we don't yet support linux sysenter). |
| * global_do_syscall is only used with system calls that we don't expect to |
| * return. Since edx becomes esp, if the syscall does return it will go to the |
| * address in [edx] (again windows specific); any desired debugging code should |
| * be pointed to from there, but note that edx will become esp, so be aware of |
| * stack limitations/protections. |
| */ |
| DECLARE_FUNC(global_do_syscall_sysenter) |
| GLOBAL_LABEL(global_do_syscall_sysenter:) |
| #if defined(X64) && defined(WINDOWS) |
| syscall /* FIXME ml64 won't take "sysenter" so half-fixing now */ |
| #else |
| sysenter |
| #endif |
| #ifdef DEBUG |
| /* We'll never ever reach here, sysenter won't/can't return to this |
| * address since it doesn't know it, but we'll put in a jmp to |
| * debug_infinite_loop just in case */ |
| jmp GLOBAL_REF(debug_infinite_loop) |
| #endif |
| END_FUNC(global_do_syscall_sysenter) |
| |
| /* Sygate case 5441 hack - the return address (edx) needs to point to |
| * ntdll to pass their verification. Global_do_syscall is really only |
| * used with system calls that aren't expected to return so we don't |
| * have to be too careful. Just shuffle the stack using the sysret addr. |
| * If there is already a return address we'll keep that (just move down |
| * a slot). |
| */ |
| #ifdef WINDOWS |
| DECLARE_FUNC(global_do_syscall_sygate_sysenter) |
| GLOBAL_LABEL(global_do_syscall_sygate_sysenter:) |
| mov REG_XSP, REG_XDX |
| /* move existing ret down a slot (note target address is |
| * computed with already inc'ed esp [see intel docs]) */ |
| pop PTRSZ [REG_XSP] |
| push PTRSZ SYMREF(sysenter_ret_address) |
| #if defined(X64) && defined(WINDOWS) |
| syscall /* FIXME ml64 won't take "sysenter" so half-fixing now */ |
| #else |
| sysenter |
| #endif |
| #ifdef DEBUG |
| /* We'll never ever reach here, sysenter won't/can't return to this |
| * address since it doesn't know it, but we'll put in a jmp to |
| * debug_infinite_loop just in case */ |
| jmp GLOBAL_REF(debug_infinite_loop) |
| #endif |
| END_FUNC(global_do_syscall_sygate_sysenter) |
| #endif |
| |
| /* Both Windows and Linux put rcx into r10 since rcx is used as the return addr */ |
| #ifdef X64 |
| /* global_do_syscall_syscall |
| * Caller is responsible for all set up: putting the syscall num in eax |
| * and the args in registers/memory. Only used with system calls |
| * that we don't expect to return, so for debug builds we go into an infinite |
| * loop if syscall returns. |
| */ |
| DECLARE_FUNC(global_do_syscall_syscall) |
| GLOBAL_LABEL(global_do_syscall_syscall:) |
| mov r10, REG_XCX |
| syscall |
| # ifdef DEBUG |
| jmp GLOBAL_REF(debug_infinite_loop) |
| # endif |
| # ifdef UNIX |
| /* we do come here for SYS_kill which can fail: try again via exit_group */ |
| jmp GLOBAL_REF(dynamorio_sys_exit_group) |
| # endif |
| END_FUNC(global_do_syscall_syscall) |
| #endif |
| |
| #ifdef WINDOWS |
| |
| /* global_do_syscall_wow64 |
| * Xref case 3922 |
| * Caller is responsible for all set up: putting the syscall num in eax, |
| * the wow64 index into ecx, and the args in edx. Only used with system calls |
| * that we don't expect to return, so for debug builds we go into an infinite |
| * loop if syscall returns. |
| */ |
| DECLARE_FUNC(global_do_syscall_wow64) |
| GLOBAL_LABEL(global_do_syscall_wow64:) |
| call PTRSZ SEGMEM(fs,HEX(0c0)) |
| #ifdef DEBUG |
| jmp GLOBAL_REF(debug_infinite_loop) |
| #endif |
| END_FUNC(global_do_syscall_wow64) |
| |
| /* global_do_syscall_wow64_index0 |
| * Same as global_do_syscall_wow64, except it zeroes out ecx. |
| */ |
| DECLARE_FUNC(global_do_syscall_wow64_index0) |
| GLOBAL_LABEL(global_do_syscall_wow64_index0:) |
| xor ecx, ecx |
| call PTRSZ SEGMEM(fs,HEX(0c0)) |
| #ifdef DEBUG |
| jmp GLOBAL_REF(debug_infinite_loop) |
| #endif |
| END_FUNC(global_do_syscall_wow64_index0) |
| |
| #endif /* WINDOWS */ |
| |
| #ifdef DEBUG |
| /* Just an infinite CPU eating loop used to mark certain failures. |
| */ |
| DECLARE_FUNC(debug_infinite_loop) |
| GLOBAL_LABEL(debug_infinite_loop:) |
| jmp GLOBAL_REF(debug_infinite_loop) |
| END_FUNC(debug_infinite_loop) |
| #endif |
| |
| #ifdef WINDOWS |
| /* We use our own syscall wrapper for key win32 system calls. |
| * |
| * We would use a dynamically generated routine created by decoding |
| * a real ntdll wrapper and tweaking it, but we need to use |
| * this for our own syscalls and have a bootstrapping problem -- so |
| * rather than hacking to get the power to decode w/o a heap, we hardcode |
| * the types we support here. |
| * |
| * We assume that all syscall wrappers are identical, and they have |
| * specific instruction sequences -- thus this routine needs to be updated |
| * with any syscall sequence change in a future version of ntdll.dll! |
| * |
| * We construct our own minimalist versions that use C calling convention |
| * and take as a first argument the system call number: |
| * |
| * ref case 5217, for Sygate compatibility the int needs to come from |
| * ntdll.dll, we use a call to NtYieldExecution+9 (int 2e; ret;) |
| * |
| * 1) mov immed, eax mov 4(esp), eax |
| * lea 4(esp), edx ==> lea 8(esp), edx |
| * int 2e int 2e |
| * ret 4*numargs ret |
| * |
| * 2) mov immed, eax mov 4(esp), eax |
| * mov 0x7ffe0300, edx mov esp, edx |
| * call {edx,(edx)} < juggle stack, see below > |
| * NOTE - to support the sygate case 5441 hack the actual instructions |
| * we use are different, but they end up doing the same thing |
| * callee: ==> sysenter |
| * mov esp, edx our_ret: |
| * sysenter ret |
| * ret |
| * ret 4*numargs |
| * |
| * => signature: dynamorio_syscall_{int2e,sysenter}(sysnum, arg1, arg2, ...) |
| */ |
| DECLARE_FUNC(dynamorio_syscall_int2e) |
| GLOBAL_LABEL(dynamorio_syscall_int2e:) |
| mov eax, [4 + esp] |
| lea edx, [8 + esp] |
| int HEX(2e) |
| ret |
| END_FUNC(dynamorio_syscall_int2e) |
| |
| DECLARE_FUNC(dynamorio_syscall_sygate_int2e) |
| GLOBAL_LABEL(dynamorio_syscall_sygate_int2e:) |
| mov eax, [4 + esp] |
| lea edx, [8 + esp] |
| call PTRSZ SYMREF(int_syscall_address) |
| ret |
| END_FUNC(dynamorio_syscall_sygate_int2e) |
| |
| DECLARE_FUNC(dynamorio_syscall_sysenter) |
| GLOBAL_LABEL(dynamorio_syscall_sysenter:) |
| /* esp + 0 return address |
| * 4 syscall num |
| * 8+ syscall args |
| * Ref case 5461 edx serves as both the argument pointer (edx+8) and the |
| * top of stack for the kernel sysexit. */ |
| mov eax, [4 + esp] |
| mov REG_XDX, REG_XSP |
| #if defined(X64) && defined(WINDOWS) |
| syscall /* FIXME ml64 won't take "sysenter" so half-fixing now */ |
| #else |
| sysenter |
| #endif |
| /* Kernel sends control to hardcoded location, which does ret, |
| * which will return directly back to the caller. Thus the following |
| * ret will never execute. */ |
| ret |
| END_FUNC(dynamorio_syscall_sysenter) |
| |
| DECLARE_GLOBAL(dynamorio_sysenter_fixup) |
| DECLARE_FUNC(dynamorio_syscall_sygate_sysenter) |
| GLOBAL_LABEL(dynamorio_syscall_sygate_sysenter:) |
| /* stack looks like: |
| * esp + 0 return address |
| * 4 syscall num |
| * 8+ syscall args |
| * Ref case 5461 edx serves as both the argument pointer (edx+8) and the |
| * top of stack for the kernel sysexit. While we could do nothing and |
| * just have the sysenter return straight back to the caller, we use |
| * sysenter_ret_address indirection to support the Sygate compatibility |
| * fix for case 5441 where we steal a ret from ntdll.dll, so we need to mangle |
| * our stack to look like |
| * esp + 0 sysenter_ret_address |
| * 4 dynamorio_sysenter_fixup |
| * 8+ syscall args |
| * sysenter_tls_slot return address |
| * before we do the edx <- esp |
| * |
| * NOTE - we can NOT just have |
| * esp + 0 sysenter_ret_address |
| * 4 return address |
| * 8 args |
| * as even though this will go to the right place, the stack will be one |
| * slot off on the return (debug builds with a frame ptr are ok, but not |
| * release). We could roll our own custom calling convention for this, |
| * but that would be a pain given how this function is called. So we use a |
| * tls slot to store the return address around the system call since |
| * there isn't room on the stack; thus this is not re-entrant, but neither is |
| * dr, and we don't make alertable system calls. An alternate scheme |
| * kept the return address off the top of the stack which works fine |
| * (nothing alertable), but just seemed too risky. |
| * FIXME - any perf impact from breaking hardware return predictor */ |
| pop REG_XDX |
| mov eax, DWORD SYMREF(sysenter_tls_offset) |
| mov SEGMEM(fs,eax), edx |
| pop REG_XAX |
| #ifdef X64 |
| /* Can't push a 64-bit immed */ |
| mov REG_XCX, dynamorio_sysenter_fixup |
| push REG_XCX |
| #else |
| push dynamorio_sysenter_fixup |
| #endif |
| push PTRSZ SYMREF(sysenter_ret_address) |
| mov REG_XDX, REG_XSP |
| #if defined(X64) && defined(WINDOWS) |
| syscall /* FIXME ml64 won't take "sysenter" so half-fixing now */ |
| #else |
| sysenter |
| #endif |
| ADDRTAKEN_LABEL(dynamorio_sysenter_fixup:) |
| /* push whatever (was the slot for the eax arg) */ |
| push REG_XAX |
| /* ecx/edx should be dead here, just borrow one */ |
| mov edx, DWORD SYMREF(sysenter_tls_offset) |
| push PTRSZ SEGMEM(fs,edx) |
| ret |
| END_FUNC(dynamorio_syscall_sygate_sysenter) |
| |
| # ifdef X64 |
| /* With the 1st 4 args in registers, we don't want the sysnum to shift them |
| * all as it's not easy to un-shift. So, we put the 1st arg last, and |
| * the SYS enum value first. We use the syscall_argsz array to restore |
| * the 1st arg. Since the return value is never larger than 64 bits, we |
| * never have to worry about a hidden 1st arg that shifts the rest. |
| */ |
| DECLARE_FUNC(dynamorio_syscall_syscall) |
| GLOBAL_LABEL(dynamorio_syscall_syscall:) |
| mov rax, QWORD SYMREF(syscalls) |
| /* the upper 32 bits are automatically zeroed */ |
| mov eax, DWORD [rax + ARG1*4] /* sysnum in rax */ |
| mov r10, QWORD SYMREF(syscall_argsz) |
| /* the upper 32 bits are automatically zeroed */ |
| mov r10d, DWORD [r10 + ARG1*4] /* # args in r10 */ |
| cmp r10, 0 |
| je dynamorio_syscall_syscall_ready |
| cmp r10, 1 |
| je dynamorio_syscall_syscall_1arg |
| cmp r10, 2 |
| je dynamorio_syscall_syscall_2arg |
| cmp r10, 3 |
| je dynamorio_syscall_syscall_3arg |
| /* else, >= 4 args, so pull from arg slot of (#args + 1) */ |
| mov ARG1, QWORD [rsp + r10*8 + 8] |
| jmp dynamorio_syscall_syscall_ready |
| dynamorio_syscall_syscall_1arg: |
| mov ARG1, ARG2 |
| jmp dynamorio_syscall_syscall_ready |
| dynamorio_syscall_syscall_2arg: |
| mov ARG1, ARG3 |
| jmp dynamorio_syscall_syscall_ready |
| dynamorio_syscall_syscall_3arg: |
| mov ARG1, ARG4 |
| /* fall-through */ |
| dynamorio_syscall_syscall_ready: |
| mov r10, rcx /* put rcx in r10 just like Nt wrappers (syscall writes rcx) */ |
| syscall |
| ret |
| END_FUNC(dynamorio_syscall_syscall) |
| # endif |
| |
| /* For WOW64 (case 3922) the syscall wrappers call *teb->WOW32Reserved (== |
| * wow64cpu!X86SwitchTo64BitMode), which is a far jmp that switches to the |
| * 64-bit cs segment (0x33 selector). They pass in ecx an index into |
| * a function table of argument conversion routines. |
| * |
| * 3) mov sysnum, eax |
| * mov tableidx, ecx |
| * call *fs:0xc0 |
| * callee: |
| * jmp 0x33:wow64cpu!CpupReturnFromSimulatedCode |
| * ret 4*numargs |
| * |
| * rather than taking in sysnum and tableidx, we take in sys_enum and |
| * look up the sysnum and tableidx to keep the same args as the other |
| * dynamorio_syscall_* routines |
| * => signature: dynamorio_syscall_wow64(sys_enum, arg1, arg2, ...) |
| */ |
| DECLARE_FUNC(dynamorio_syscall_wow64) |
| GLOBAL_LABEL(dynamorio_syscall_wow64:) |
| mov eax, [4 + esp] |
| mov edx, DWORD SYMREF(wow64_index) |
| mov ecx, [edx + eax*4] |
| mov edx, DWORD SYMREF(syscalls) |
| mov eax, [edx + eax*4] |
| lea edx, [8 + esp] |
| call PTRSZ SEGMEM(fs,HEX(0c0)) |
| ret |
| END_FUNC(dynamorio_syscall_wow64) |
| |
| /* Win8 has no index and furthermore requires the stack to be set |
| * up (i.e., we can't just point edx where we want it). |
| * Thus, we must shift the retaddr one slot down on top of sys_enum. |
| * => signature: dynamorio_syscall_wow64_noedx(sys_enum, arg1, arg2, ...) |
| */ |
| DECLARE_FUNC(dynamorio_syscall_wow64_noedx) |
| GLOBAL_LABEL(dynamorio_syscall_wow64_noedx:) |
| mov eax, [4 + esp] |
| mov ecx, DWORD SYMREF(syscalls) |
| mov eax, [ecx + eax*4] |
| mov ecx, [esp] |
| mov [esp + 4], ecx |
| lea esp, [esp + 4] |
| call PTRSZ SEGMEM(fs,HEX(0c0)) |
| /* we have to restore the stack shift of course (i#1036) */ |
| mov ecx, [esp] |
| mov [esp - 4], ecx |
| lea esp, [esp - 4] |
| ret |
| END_FUNC(dynamorio_syscall_wow64_noedx) |
| |
| #endif /* WINDOWS */ |
| |
| #endif /* !NOT_DYNAMORIO_CORE_PROPER */ |
| |
| /* we share dynamorio_syscall w/ preload */ |
| #ifdef UNIX |
| /* to avoid libc wrappers we roll our own syscall here |
| * hardcoded to use int 0x80 for 32-bit -- FIXME: use something like do_syscall |
| * and syscall for 64-bit. |
| * signature: dynamorio_syscall(sysnum, num_args, arg1, arg2, ...) |
| * For Linux, the argument max is 6. |
| * For MacOS, the argument max is 6 for x64 and 7 for x86. |
| */ |
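| |
| /* Illustrative call only (SYS_write and its arguments are an example, not |
| * taken from this file): |
| * res = dynamorio_syscall(SYS_write, 3, fd, buf, count); |
| * i.e., the syscall number, then the number of syscall args, then the args |
| * themselves. |
| */ |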
| DECLARE_FUNC(dynamorio_syscall) |
| GLOBAL_LABEL(dynamorio_syscall:) |
| /* x64 kernel doesn't clobber all the callee-saved registers */ |
| push REG_XBX /* stack now aligned for x64 */ |
| # ifdef X64 |
| /* reverse order so we don't clobber earlier args */ |
| mov REG_XBX, ARG2 /* put num_args where we can reference it longer */ |
| mov rax, ARG1 /* sysnum: only need eax, but need rax to use ARG1 (or movzx) */ |
| # ifdef MACOS |
| /* for now we assume a BSD syscall */ |
| or rax, 0x2000000 |
| # endif |
| cmp REG_XBX, 0 |
| je syscall_ready |
| mov ARG1, ARG3 |
| cmp REG_XBX, 1 |
| je syscall_ready |
| mov ARG2, ARG4 |
| cmp REG_XBX, 2 |
| je syscall_ready |
| mov ARG3, ARG5 |
| cmp REG_XBX, 3 |
| je syscall_ready |
| mov ARG4, ARG6 |
| cmp REG_XBX, 4 |
| je syscall_ready |
| mov ARG5, [2*ARG_SZ + REG_XSP] /* arg7: above xbx and retaddr */ |
| cmp REG_XBX, 5 |
| je syscall_ready |
| mov ARG6, [3*ARG_SZ + REG_XSP] /* arg8: above arg7, xbx, retaddr */ |
| syscall_ready: |
| mov r10, rcx |
| syscall |
| # else |
| push REG_XBP |
| push REG_XSI |
| push REG_XDI |
| /* add 16 to skip the 4 pushes |
| * XXX: rather than this dispatch, could have separate routines |
| * for each #args, or could just blindly read upward on the stack. |
| * for dispatch, if we assume a fixed mov instr size we could use a single ind jmp */ |
| mov ecx, [16+ 8 + esp] /* num_args */ |
| cmp ecx, 0 |
| je syscall_0args |
| cmp ecx, 1 |
| je syscall_1args |
| cmp ecx, 2 |
| je syscall_2args |
| cmp ecx, 3 |
| je syscall_3args |
| cmp ecx, 4 |
| je syscall_4args |
| cmp ecx, 5 |
| je syscall_5args |
| # ifdef MACOS |
| cmp ecx, 6 |
| je syscall_6args |
| # ifdef INTERNAL |
| cmp ecx, 7 |
| jg GLOBAL_REF(unexpected_return) |
| # endif |
| mov eax, [16+36 + esp] /* arg7 */ |
| syscall_6args: |
| # elif defined(INTERNAL) |
| cmp ecx, 6 |
| jg GLOBAL_REF(unexpected_return) |
| # endif |
| mov ebp, [16+32 + esp] /* arg6 */ |
| syscall_5args: |
| mov edi, [16+28 + esp] /* arg5 */ |
| syscall_4args: |
| mov esi, [16+24 + esp] /* arg4 */ |
| syscall_3args: |
| mov edx, [16+20 + esp] /* arg3 */ |
| syscall_2args: |
| mov ecx, [16+16 + esp] /* arg2 */ |
| syscall_1args: |
| mov ebx, [16+12 + esp] /* arg1 */ |
| syscall_0args: |
| # ifdef MACOS |
| push eax /* 7th arg, if any */ |
| /* Arg size is encoded in upper bits. |
| * XXX: or is that only for sysenter gateway? |
| * We assume this is size, not count, so for our "7 arg" |
| * call (really 6 args with one of them 64-bit) we leave it as is. |
| */ |
| mov eax, [20+ 8 + esp] /* num_args */ |
| shl eax, 18 /* <<16 but also *4 for size */ |
| or eax, [20+ 4 + esp] /* sysnum */ |
| /* args are on stack, w/ an extra slot (retaddr of syscall wrapper) */ |
| push ebp |
| push edi |
| push esi |
| push edx |
| push ecx |
| push ebx /* aligned to 16 after this push */ |
| push 0 /* extra slot (app retaddr) */ |
| /* It simplifies our syscall calling to have a single dynamorio_syscall() |
| * signature that returns int64 -- but most syscalls just return a 32-bit |
| * value and the kernel does not clear edx. Thus we need to do so, which |
| * should be safe since edx is caller-saved. (Note that we do not risk |
| * doing this for app syscalls: only those called by DR.) |
| */ |
| mov edx, 0 |
| # else |
| mov eax, [16+ 4 + esp] /* sysnum */ |
| # endif |
| /* PR 254280: we assume int$80 is ok even for LOL64; at worst it is just slow. |
| * For Mac, it's possible to do sysenter here as we can store the retaddr |
| * in edx ourselves (in fact see r2514 dynamorio_syscall_sysenter for an |
| * implementation, now removed), but we still need int for certain syscalls |
| * (returning 64-bit values, e.g.) so we go w/ int always and assume our |
| * syscall perf doesn't matter much (should be rare). |
| */ |
| int HEX(80) |
| # ifdef MACOS |
| lea esp, [8*ARG_SZ + esp] /* must not change flags */ |
| # endif |
| pop REG_XDI |
| pop REG_XSI |
| pop REG_XBP |
| # endif /* X64 */ |
| pop REG_XBX |
| /* return val is in eax for us */ |
| /* for MacOS, it can also include edx, so be sure not to clobber that! */ |
| # ifdef MACOS |
| /* convert to -errno */ |
| jae syscall_success |
| neg eax |
| syscall_success: |
| # endif |
| ret |
| END_FUNC(dynamorio_syscall) |
| |
| # ifdef MACOS |
| /* Mach dep syscall invocation. |
| * Signature: dynamorio_mach_dep_syscall(sysnum, num_args, arg1, arg2, ...) |
| * Only supports up to 4 args. |
| */ |
| DECLARE_FUNC(dynamorio_mach_dep_syscall) |
| GLOBAL_LABEL(dynamorio_mach_dep_syscall:) |
| /* x64 kernel doesn't clobber all the callee-saved registers */ |
| push REG_XBX |
| # ifdef X64 |
| /* reverse order so we don't clobber earlier args */ |
| mov REG_XBX, ARG2 /* put num_args where we can reference it longer */ |
| mov rax, ARG1 /* sysnum: only need eax, but need rax to use ARG1 (or movzx) */ |
| cmp REG_XBX, 0 |
| je mach_dep_syscall_ready |
| mov ARG1, ARG3 |
| cmp REG_XBX, 1 |
| je mach_dep_syscall_ready |
| mov ARG2, ARG4 |
| cmp REG_XBX, 2 |
| je mach_dep_syscall_ready |
| mov ARG3, ARG5 |
| cmp REG_XBX, 3 |
| je mach_dep_syscall_ready |
| mov ARG4, ARG6 |
| # else |
| push REG_XBP |
| push REG_XSI |
| push REG_XDI |
| /* add 16 to skip the 4 pushes */ |
| mov ecx, [16+ 8 + esp] /* num_args */ |
| cmp ecx, 0 |
| je mach_dep_syscall_0args |
| cmp ecx, 1 |
| je mach_dep_syscall_1args |
| cmp ecx, 2 |
| je mach_dep_syscall_2args |
| cmp ecx, 3 |
| je mach_dep_syscall_3args |
| mov esi, [16+24 + esp] /* arg4 */ |
| mach_dep_syscall_3args: |
| mov edx, [16+20 + esp] /* arg3 */ |
| mach_dep_syscall_2args: |
| mov ecx, [16+16 + esp] /* arg2 */ |
| mach_dep_syscall_1args: |
| mov ebx, [16+12 + esp] /* arg1 */ |
| mach_dep_syscall_0args: |
| mov eax, [16+ 4 + esp] /* sysnum */ |
| lea REG_XSP, [-2*ARG_SZ + REG_XSP] /* maintain align-16: retaddr-5th below */ |
| /* args are on stack, w/ an extra slot (retaddr of syscall wrapper) */ |
| push esi |
| push edx |
| push ecx |
| push ebx |
| push 0 /* extra slot */ |
| /* clear the top half so we can always consider the result 64-bit */ |
| mov edx, 0 |
| # endif |
| /* mach dep syscalls use interrupt 0x82 */ |
| int HEX(82) |
| # ifndef X64 |
| lea esp, [7*ARG_SZ + esp] /* must not change flags */ |
| pop REG_XDI |
| pop REG_XSI |
| pop REG_XBP |
| # endif |
| pop REG_XBX |
| /* return val is in eax for us */ |
| /* for MacOS, it can also include edx, so be sure not to clobber that! */ |
| /* convert to -errno */ |
| jae mach_dep_syscall_success |
| neg eax |
| mach_dep_syscall_success: |
| ret |
| END_FUNC(dynamorio_mach_dep_syscall) |
| |
| |
| /* Mach syscall invocation. |
| * Signature: ptr_int_t dynamorio_mach_syscall(sysnum, num_args, arg1, arg2, ...) |
| * Only supports up to 4 args. |
| * Does not support returning a 64-bit value in 32-bit mode. |
| */ |
| DECLARE_FUNC(dynamorio_mach_syscall) |
| GLOBAL_LABEL(dynamorio_mach_syscall:) |
| /* x64 kernel doesn't clobber all the callee-saved registers */ |
| push REG_XBX |
| # ifdef X64 |
| /* reverse order so we don't clobber earlier args */ |
| mov REG_XBX, ARG2 /* put num_args where we can reference it longer */ |
| mov rax, ARG1 /* sysnum: only need eax, but need rax to use ARG1 (or movzx) */ |
| cmp REG_XBX, 0 |
| je dynamorio_mach_syscall_ready |
| mov ARG1, ARG3 |
| cmp REG_XBX, 1 |
| je dynamorio_mach_syscall_ready |
| mov ARG2, ARG4 |
| cmp REG_XBX, 2 |
| je dynamorio_mach_syscall_ready |
| mov ARG3, ARG5 |
| cmp REG_XBX, 3 |
| je dynamorio_mach_syscall_ready |
| mov ARG4, ARG6 |
| # else |
| push REG_XBP |
| push REG_XSI |
| push REG_XDI |
| /* add 16 to skip the 4 pushes */ |
| mov ecx, [16+ 8 + esp] /* num_args */ |
| cmp ecx, 0 |
| je dynamorio_mach_syscall_0args |
| cmp ecx, 1 |
| je dynamorio_mach_syscall_1args |
| cmp ecx, 2 |
| je dynamorio_mach_syscall_2args |
| cmp ecx, 3 |
| je dynamorio_mach_syscall_3args |
| mov esi, [16+24 + esp] /* arg4 */ |
| dynamorio_mach_syscall_3args: |
| mov edx, [16+20 + esp] /* arg3 */ |
| dynamorio_mach_syscall_2args: |
| mov ecx, [16+16 + esp] /* arg2 */ |
| dynamorio_mach_syscall_1args: |
| mov ebx, [16+12 + esp] /* arg1 */ |
| dynamorio_mach_syscall_0args: |
| mov eax, [16+ 4 + esp] /* sysnum */ |
| # ifdef X64 |
| or eax, SYSCALL_NUM_MARKER_MACH |
| # else |
| /* The sysnum is passed as a negative number */ |
| neg eax |
| # endif |
| /* args are on stack, w/ an extra slot (retaddr of syscall wrapper) */ |
| lea REG_XSP, [-2*ARG_SZ + REG_XSP] /* maintain align-16: retaddr-5th below */ |
| /* args are on stack, w/ an extra slot (retaddr of syscall wrapper) */ |
| push esi |
| push edx |
| push ecx |
| push ebx |
| push 0 /* extra slot */ |
| # endif |
| /* If we use ADDRTAKEN_LABEL and GLOBAL_REF we get text relocation |
| * complaints so we instead do this hack: |
| */ |
| call dynamorio_mach_syscall_next |
| dynamorio_mach_syscall_next: |
| pop REG_XDX |
| lea REG_XDX, [1/*pop*/ + 3/*lea*/ + 2/*sysenter*/ + 2/*mov*/ + REG_XDX] |
| mov REG_XCX, REG_XSP |
| /* We have to use sysenter for a Mach syscall, else we get SIGSYS. |
| * This implies that we can't return 64-bit in 32-bit mode. |
| */ |
| sysenter |
| # ifndef X64 |
| lea esp, [7*ARG_SZ + esp] /* must not change flags */ |
| pop REG_XDI |
| pop REG_XSI |
| pop REG_XBP |
| # endif |
| pop REG_XBX |
| /* Return val is in eax for us. |
| * Note that unlike BSD and Machdep syscalls, Mach syscalls do not |
| * use flags to indicate success. |
| */ |
| ret |
| END_FUNC(dynamorio_mach_syscall) |
| |
| # endif /* MACOS */ |
| |
| /* FIXME: this function should be inside #ifdef CLIENT_INTERFACE. |
| * However, the compiler complains about it in the |
| * vps-debug-internal-32 build, so we removed the ifdef for now. |
| */ |
| /* i#555: to avoid clients using the app's vsyscall, we force all clients |
| * to use int 0x80 for system calls. |
| */ |
| DECLARE_FUNC(client_int_syscall) |
| GLOBAL_LABEL(client_int_syscall:) |
| int HEX(80) |
| ret |
| END_FUNC(client_int_syscall) |
| #endif /* UNIX */ |
| #ifndef NOT_DYNAMORIO_CORE_PROPER |
| #ifdef UNIX |
| |
| #ifdef LINUX /* XXX i#1285: implement MacOS private loader + injector */ |
| #if !defined(STANDALONE_UNIT_TEST) && !defined(STATIC_LIBRARY) |
| /* i#47: Early injection _start routine. The kernel sets all registers to zero |
| * except the SP and PC. The stack has argc, argv[], envp[], and the auxiliary |
| * vector laid out on it. |
| */ |
| DECLARE_FUNC(_start) |
| GLOBAL_LABEL(_start:) |
| xor REG_XBP, REG_XBP /* Terminate stack traces at NULL. */ |
| # ifdef X64 |
| mov ARG1, REG_XSP |
| # else |
| mov REG_XAX, REG_XSP |
| # ifdef MACOS |
| lea REG_XSP, [-3*ARG_SZ + REG_XSP] /* maintain align-16: offset retaddr */ |
| # endif |
| push REG_XAX |
| # endif |
| CALLC0(GLOBAL_REF(privload_early_inject)) |
| jmp GLOBAL_REF(unexpected_return) |
| END_FUNC(_start) |
| #endif /* !STANDALONE_UNIT_TEST && !STATIC_LIBRARY */ |
| #endif /* LINUX */ |
| |
| /* while with pre-2.6.9 kernels we were able to rely on the kernel's |
| * default sigreturn code sequence and be more platform independent, |
| * case 6700 necessitates having our own code, which for now, like |
| * dynamorio_syscall, hardcodes int 0x80 |
| */ |
| DECLARE_FUNC(dynamorio_sigreturn) |
| GLOBAL_LABEL(dynamorio_sigreturn:) |
| #ifdef X64 |
| mov eax, HEX(f) |
| mov r10, rcx |
| syscall |
| #else |
| # ifdef MACOS |
| /* we assume we don't need to align the stack (tricky to do so) */ |
| /* XXX: should we target _sigtramp instead? Some callers aren't |
| * on a signal frame though. |
| */ |
| mov eax, HEX(b8) |
| # else |
| mov eax, HEX(ad) |
| # endif |
| /* PR 254280: we assume int$80 is ok even for LOL64 */ |
| int HEX(80) |
| #endif |
| /* should not return. if we somehow do, the infinite loop is intentional. |
| * FIXME: do better in release build! FIXME - why not an int3? */ |
| jmp GLOBAL_REF(unexpected_return) |
| END_FUNC(dynamorio_sigreturn) |
| |
| /* we need to exit without using any stack, to support |
| * THREAD_SYNCH_TERMINATED_AND_CLEANED. |
| * XXX: on MacOS this does use the stack. |
| * FIXME i#1403: on MacOS we fail to free the app's stack: we need to pass it to |
| * bsdthread_terminate. |
| */ |
| DECLARE_FUNC(dynamorio_sys_exit) |
| GLOBAL_LABEL(dynamorio_sys_exit:) |
| #ifdef MACOS |
| /* We need the mach port in order to invoke bsdthread_terminate */ |
| mov eax, MACH_thread_self_trap |
| # ifdef X64 |
| or eax, SYSCALL_NUM_MARKER_MACH |
| # else |
| neg eax |
| /* XXX: what about stack alignment? hard to control since we jumped here */ |
| # endif |
| /* see dynamorio_mach_syscall about why we do this call;pop and sysenter */ |
| call dynamorio_sys_exit_next |
| dynamorio_sys_exit_next: |
| pop REG_XDX |
| lea REG_XDX, [1/*pop*/ + 3/*lea*/ + 2/*sysenter*/ + 2/*mov*/ + REG_XDX] |
| mov REG_XCX, REG_XSP |
| sysenter |
| jae dynamorio_sys_exit_failed |
| # ifdef X64 |
| mov ARG4, 0 /* stack to free: NULL */ |
| mov ARG3, 0 /* stack free size: 0 */ |
| mov ARG2, REG_XAX /* kernel port, which we just acquired */ |
| mov ARG1, 0 /* join semaphore: SEMAPHORE_NULL */ |
| mov eax, SYS_bsdthread_terminate |
| or eax, HEX(2000000) /* 2<<24 for BSD syscall */ |
| mov r10, rcx |
| syscall |
| # else |
| lea REG_XSP, [-ARG_SZ + REG_XSP] /* maintain align-16: offset retaddr */ |
| push 0 /* stack to free: NULL */ |
| push 0 /* stack free size: 0 */ |
| push REG_XAX /* kernel port, which we just acquired */ |
| push 0 /* join semaphore: SEMAPHORE_NULL */ |
| push 0 /* retaddr slot */ |
| mov eax, SYS_bsdthread_terminate |
| int HEX(80) |
| # endif |
| #else /* LINUX: */ |
| # ifdef X64 |
| mov edi, 0 /* exit code: hardcoded */ |
| mov eax, SYS_exit |
| mov r10, rcx |
| syscall |
| # else |
| mov ebx, 0 /* exit code: hardcoded */ |
| mov eax, SYS_exit |
| /* PR 254280: we assume int$80 is ok even for LOL64 */ |
| int HEX(80) |
| # endif |
| #endif |
| /* should not return. if we somehow do, the infinite loop is intentional. |
| * FIXME: do better in release build! FIXME - why not an int3? */ |
| dynamorio_sys_exit_failed: |
| jmp GLOBAL_REF(unexpected_return) |
| END_FUNC(dynamorio_sys_exit) |
| |
| #ifdef LINUX |
| /* we need to call futex_wakeall without using any stack, to support |
| * THREAD_SYNCH_TERMINATED_AND_CLEANED. |
| * takes int* futex in xax. |
| */ |
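| /* Roughly equivalent C (illustrative only): |
| *   syscall(SYS_futex, futex, FUTEX_WAKE, INT_MAX, NULL, NULL, 0); |
| * followed by the thread exit below; 202 and 240 are SYS_futex on Linux x64 |
| * and ia32 respectively, and FUTEX_WAKE is 1. |
| */ |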
| DECLARE_FUNC(dynamorio_futex_wake_and_exit) |
| GLOBAL_LABEL(dynamorio_futex_wake_and_exit:) |
| #ifdef X64 |
| mov ARG6, 0 |
| mov ARG5, 0 |
| mov ARG4, 0 |
| mov ARG3, 0x7fffffff /* arg3 = INT_MAX */ |
| mov ARG2, 1 /* arg2 = FUTEX_WAKE */ |
| mov ARG1, rax /* &futex, passed in rax */ |
| mov rax, 202 /* SYS_futex */ |
| mov r10, rcx |
| syscall |
| #else |
| mov ebp, 0 /* arg6 */ |
| mov edi, 0 /* arg5 */ |
| mov esi, 0 /* arg4 */ |
| mov edx, 0x7fffffff /* arg3 = INT_MAX */ |
| mov ecx, 1 /* arg2 = FUTEX_WAKE */ |
| mov ebx, eax /* arg1 = &futex, passed in eax */ |
| mov eax, 240 /* SYS_futex */ |
| /* PR 254280: we assume int$80 is ok even for LOL64 */ |
| int HEX(80) |
| #endif |
| jmp GLOBAL_REF(dynamorio_sys_exit) |
| END_FUNC(dynamorio_futex_wake_and_exit) |
| #endif /* LINUX */ |
| |
| #ifdef MACOS |
| /* We need to call semaphore_signal_all without using dstack, to support |
| * THREAD_SYNCH_TERMINATED_AND_CLEANED. We have to put syscall args on |
| * the stack for 32-bit, and we use the stack for call;pop for |
| * sysenter -- so we use the app stack, which we assume the caller has |
| * put us on. We're only called when terminating a thread so transparency |
| * should be ok so long as the app's stack is valid. |
| * Takes KSYNCH_TYPE* in xax. |
| */ |
| DECLARE_FUNC(dynamorio_semaphore_signal_all) |
| GLOBAL_LABEL(dynamorio_semaphore_signal_all:) |
| mov REG_XAX, DWORD [REG_XAX] /* load mach_synch_t->sem */ |
| # ifdef X64 |
| mov ARG1, REG_XAX |
| mov eax, MACH_semaphore_signal_all_trap |
| or eax, SYSCALL_NUM_MARKER_MACH |
| # else |
| push REG_XAX |
| mov eax, MACH_semaphore_signal_all_trap |
| neg eax |
| /* args are on stack, w/ an extra slot (retaddr of syscall wrapper) */ |
| push 0 /* extra slot */ |
| /* XXX: what about stack alignment? hard to control since we jumped here */ |
| # endif |
| /* see dynamorio_mach_syscall about why we do this call;pop and sysenter */ |
| call dynamorio_semaphore_next |
| dynamorio_semaphore_next: |
| pop REG_XDX |
| lea REG_XDX, [1/*pop*/ + 3/*lea*/ + 2/*sysenter*/ + 2/*mov*/ + REG_XDX] |
| mov REG_XCX, REG_XSP |
| sysenter |
| # ifndef X64 |
| lea esp, [2*ARG_SZ + esp] /* must not change flags */ |
| # endif |
| /* we ignore return val */ |
| jmp GLOBAL_REF(dynamorio_sys_exit) |
| END_FUNC(dynamorio_semaphore_signal_all) |
| #endif /* MACOS */ |
| |
| /* exit entire group without using any stack, in case something like |
| * SYS_kill via cleanup_and_terminate fails. |
| * XXX: on 32-bit MacOS this does use the stack. |
| */ |
| DECLARE_FUNC(dynamorio_sys_exit_group) |
| GLOBAL_LABEL(dynamorio_sys_exit_group:) |
| #ifdef X64 |
| mov edi, 0 /* exit code: hardcoded */ |
| # ifdef MACOS |
| mov eax, SYS_exit |
| # else |
| mov eax, SYS_exit_group |
| # endif |
| mov r10, rcx |
| syscall |
| #else |
| # ifdef MACOS |
| lea REG_XSP, [-ARG_SZ + REG_XSP] /* maintain align-16: offset retaddr */ |
| push 0 /* exit code: hardcoded */ |
| push 0 /* retaddr slot */ |
| mov eax, SYS_exit |
| # else |
| mov ebx, 0 /* exit code: hardcoded */ |
| mov eax, SYS_exit_group |
| # endif |
| /* PR 254280: we assume int$80 is ok even for LOL64 */ |
| int HEX(80) |
| #endif |
| /* should not return. if we somehow do, the infinite loop is intentional. |
| * FIXME: do better in release build! why not an int3? */ |
| jmp GLOBAL_REF(unexpected_return) |
| END_FUNC(dynamorio_sys_exit_group) |
| |
| #if defined(LINUX) && !defined(X64) |
| /* since our handler is rt, we have no source for the kernel's/libc's |
| * default non-rt sigreturn, so we set up our own. |
| */ |
| DECLARE_FUNC(dynamorio_nonrt_sigreturn) |
| GLOBAL_LABEL(dynamorio_nonrt_sigreturn:) |
| pop eax /* I don't understand why */ |
| mov eax, HEX(77) |
| /* PR 254280: we assume int$80 is ok even for LOL64 */ |
| int HEX(80) |
| /* should not return. if we somehow do, the infinite loop is intentional. |
| * FIXME: do better in release build! FIXME - why not an int3? */ |
| jmp GLOBAL_REF(unexpected_return) |
| END_FUNC(dynamorio_nonrt_sigreturn) |
| #endif |
| |
| #ifdef HAVE_SIGALTSTACK |
| /* We used to get the SP by taking the address of our args, but that doesn't |
| * work on x64 nor with other compilers. Today we use asm to pass in the |
| * initial SP. For x64, we add a 4th register param and tail call to |
| * master_signal_handler_C. Adding a param and doing a tail call on ia32 is |
| * hard, so we make a real call and pass only xsp. The C routine uses it to |
| * read the original params. |
| * See also PR 305020. |
| */ |
| DECLARE_FUNC(master_signal_handler) |
| GLOBAL_LABEL(master_signal_handler:) |
| #ifdef X64 |
| # ifdef LINUX |
| mov ARG4, REG_XSP /* pass as extra arg */ |
| # else |
| mov ARG6, REG_XSP /* pass as extra arg */ |
| # endif |
| jmp GLOBAL_REF(master_signal_handler_C) |
| /* master_signal_handler_C will do the ret */ |
| #else |
| /* We need to pass in xsp. The easiest way is to create an |
| * intermediate frame. |
| */ |
| mov REG_XAX, REG_XSP |
| CALLC1_FRESH(GLOBAL_REF(master_signal_handler_C), REG_XAX) |
| # ifdef MACOS |
| mov eax, ARG5 /* ucxt */ |
| /* Set up args to SYS_sigreturn, skipping the retaddr slot */ |
| mov edx, ARG2 /* style */ |
| CALLC2_FRESH(GLOBAL_REF(dynamorio_sigreturn), eax, edx) |
| jmp GLOBAL_REF(unexpected_return) |
| # else |
| ret |
| # endif |
| #endif |
| END_FUNC(master_signal_handler) |
| |
| #else /* !HAVE_SIGALTSTACK */ |
| |
| /* PR 283149: if we're on the app stack now and we need to deliver |
| * immediately, we can't copy over our own sig frame w/ the app's, and we |
| * can't push the app's below ours and have continuation work. One choice |
| * is to copy the frame to pending and assume we'll deliver right away. |
| * Instead we always swap to dstack, which also makes us a little more |
| * transparent wrt running out of app stack or triggering app stack guard |
| * pages. We do it in asm since it's ugly to swap stacks in the middle |
| * of a C routine: have to fix up locals + frame ptr, or jmp to start of |
| * func and clobber callee-saved regs (which messes up vmkernel sigreturn). |
| */ |
| DECLARE_FUNC(master_signal_handler) |
| GLOBAL_LABEL(master_signal_handler:) |
| mov REG_XAX, ARG1 |
| mov REG_XCX, ARG2 |
| mov REG_XDX, ARG3 |
| /* save args */ |
| push REG_XAX |
| push REG_XCX |
| push REG_XDX |
| /* make space for answers: struct clone_and_swap_args */ |
| sub REG_XSP, CLONE_AND_SWAP_STRUCT_SIZE |
| mov REG_XAX, REG_XSP |
| /* call a C routine rather than writing everything in asm */ |
| CALLC2(GLOBAL_REF(sig_should_swap_stack), REG_XAX, REG_XDX) |
| cmp REG_XAX, 0 |
| pop REG_XAX /* clone_and_swap_args.stack */ |
| pop REG_XCX /* clone_and_swap_args.tos */ |
| je no_swap |
| /* calculate the offset between stacks */ |
| mov REG_XDX, REG_XAX |
| sub REG_XDX, REG_XCX /* shift = stack - tos */ |
| # ifdef VMX86_SERVER |
| /* update the two parameters to sigreturn for new stack |
| * we can eliminate this once we have PR 405694 |
| */ |
| # ifdef X64 |
| add r12, REG_XDX /* r12 += shift */ |
| # else |
| add REG_XSI, REG_XDX /* xsi += shift */ |
| # endif |
| add REG_XBP, REG_XDX /* xbp += shift */ |
| # endif |
| push REG_XDX |
| CALLC2(GLOBAL_REF(clone_and_swap_stack), REG_XAX, REG_XCX) |
| /* get shift back and update arg2 and arg3 */ |
| pop REG_XDX |
| pop REG_XCX /* arg3 */ |
| pop REG_XAX /* arg2 */ |
| add REG_XAX, REG_XDX /* arg2 += shift */ |
| add REG_XCX, REG_XDX /* arg3 += shift */ |
| # ifndef X64 |
| /* update the official arg2 and arg3 on the stack */ |
| mov [3*ARG_SZ + REG_XSP], REG_XAX /* skip arg1+retaddr+arg1 */ |
| mov [4*ARG_SZ + REG_XSP], REG_XCX |
| # endif |
| push REG_XAX |
| push REG_XCX |
| /* need to get arg1, old frame, new frame */ |
| mov REG_XAX, [4*ARG_SZ + REG_XSP] /* skip 3 args + retaddr */ |
| neg REG_XDX |
| add REG_XDX, REG_XSP /* xsp-shift = old frame */ |
| add REG_XDX, 3*ARG_SZ /* old frame */ |
| mov REG_XCX, REG_XSP |
| add REG_XCX, 3*ARG_SZ /* new frame */ |
| /* have to be careful about order of reg params */ |
| CALLC5(GLOBAL_REF(fixup_rtframe_pointers), 0, REG_XAX, REG_XDX, REG_XCX, 0) |
| no_swap: |
| # ifdef X64 |
| pop ARG3 |
| pop ARG2 |
| pop ARG1 |
| mov rcx, rsp /* pass as 4th arg */ |
| jmp GLOBAL_REF(master_signal_handler_C) |
| /* can't return, no retaddr */ |
| # else |
| add REG_XSP, 3*ARG_SZ |
| /* We need to pass in xsp. The easiest way is to create an |
| * intermediate frame. |
| */ |
| mov REG_XAX, REG_XSP |
| CALLC1(GLOBAL_REF(master_signal_handler_C), REG_XAX) |
| ret |
| # endif |
| END_FUNC(master_signal_handler) |
| #endif /* !HAVE_SIGALTSTACK */ |
| |
| #ifdef LINUX |
| /* SYS_clone swaps the stack so we need asm support to call it. |
| * signature: |
| * thread_id_t dynamorio_clone(uint flags, byte *newsp, void *ptid, void *tls, |
| * void *ctid, void (*func)(void)) |
| */ |
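| /* Illustrative call (sketch only; the real flags and args come from our |
| * thread-creation code): |
| *   tid = dynamorio_clone(flags, newsp, &ptid, tls, &ctid, thread_func); |
| * The parent gets the child tid back; the child never returns here, as it |
| * pops func off newsp and calls it. |
| */ |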
| DECLARE_FUNC(dynamorio_clone) |
| GLOBAL_LABEL(dynamorio_clone:) |
| /* save func for use post-syscall on the newsp. |
| * when using clone_record_t we have 4 slots we can clobber. |
| */ |
| # ifdef X64 |
| sub ARG2, ARG_SZ |
| mov [ARG2], ARG6 /* func is now on TOS of newsp */ |
| /* all args are already in syscall registers */ |
| mov r10, rcx |
| mov REG_XAX, SYS_clone |
| syscall |
| # else |
| mov REG_XAX, ARG6 |
| mov REG_XCX, ARG2 |
| sub REG_XCX, ARG_SZ |
| mov [REG_XCX], REG_XAX /* func is now on TOS of newsp */ |
| mov REG_XDX, ARG3 |
| /* preserve callee-saved regs */ |
| push REG_XBX |
| push REG_XSI |
| push REG_XDI |
| /* now can't use ARG* since xsp modified by pushes */ |
| mov REG_XBX, DWORD [4*ARG_SZ + REG_XSP] /* ARG1 + 3 pushes */ |
| mov REG_XSI, DWORD [7*ARG_SZ + REG_XSP] /* ARG4 + 3 pushes */ |
| mov REG_XDI, DWORD [8*ARG_SZ + REG_XSP] /* ARG5 + 3 pushes */ |
| mov REG_XAX, SYS_clone |
| /* PR 254280: we assume int$80 is ok even for LOL64 */ |
| int HEX(80) |
| # endif |
| cmp REG_XAX, 0 |
| jne dynamorio_clone_parent |
| /* avoid conflicts w/ parent's TLS by clearing our reg now */ |
| mov SEG_TLS, ax |
| pop REG_XCX |
| call REG_XCX |
| /* shouldn't return */ |
| jmp GLOBAL_REF(unexpected_return) |
| dynamorio_clone_parent: |
| # ifndef X64 |
| /* restore callee-saved regs */ |
| pop REG_XDI |
| pop REG_XSI |
| pop REG_XBX |
| # endif |
| /* return val is in eax still */ |
| ret |
| END_FUNC(dynamorio_clone) |
| #endif /* LINUX */ |
| |
| #endif /* UNIX */ |
| |
| |
| #ifdef MACOS |
| /* Thread interception at the user function. We need to get the |
| * stack pointer and to preserve callee-saved registers, as we will return |
| * back past the user function to the pthread layer (i#1403 covers |
| * intercepting earlier). We also clear fs, as the kernel seems to set it to |
| * point at a flat whole-address-space value, messing up our checks for |
| * it being initialized. |
| */ |
| DECLARE_FUNC(new_bsdthread_intercept) |
| GLOBAL_LABEL(new_bsdthread_intercept:) |
| /* We assume we can go ahead and clobber caller-saved regs. */ |
| mov eax, 0 |
| mov fs, eax |
| mov REG_XAX, ARG1 |
| PUSH_PRIV_MCXT(0 /* for priv_mcontext_t.pc */) |
| lea REG_XAX, [REG_XSP] /* stack grew down, so priv_mcontext_t at tos */ |
| CALLC1_FRESH(GLOBAL_REF(new_bsdthread_setup), REG_XAX) |
| /* should not return */ |
| jmp GLOBAL_REF(unexpected_return) |
| END_FUNC(new_bsdthread_intercept) |
| #endif |
| |
| |
| #ifdef WINDOWS |
| /* |
| * nt_continue_dynamo_start -- invoked to give dynamo control over |
| * exception handler continuation (after a call to NtContinue). |
| * identical to internal_dynamo_start except it calls nt_continue_start_setup |
| * to get the real next pc, and has an esp adjustment at the start. |
| */ |
| DECLARE_FUNC(nt_continue_dynamo_start) |
| GLOBAL_LABEL(nt_continue_dynamo_start:) |
| /* assume valid esp |
| * FIXME: this routine should really not assume esp */ |
| |
| /* grab exec state and pass as param in a priv_mcontext_t struct */ |
| PUSH_PRIV_MCXT(0 /* for priv_mcontext_t.pc */) |
| lea REG_XAX, [REG_XSP] /* stack grew down, so priv_mcontext_t at tos */ |
| |
| /* Call nt_continue_setup passing the priv_mcontext_t. It will |
| * obtain and initialize this thread's dcontext pointer and |
| * begin execution with the passed-in state. |
| */ |
| CALLC1(GLOBAL_REF(nt_continue_setup), REG_XAX) |
| /* should not return */ |
| jmp GLOBAL_REF(unexpected_return) |
| END_FUNC(nt_continue_dynamo_start) |
| #endif /* WINDOWS */ |
| |
| /* back_from_native_retstubs -- We use a different version of back_from_native for |
| * each nested module transition. This has to have MAX_NATIVE_RETSTACK |
| * elements, which we check in native_exec_init(). The size of each entry has |
| * to match BACK_FROM_NATIVE_RETSTUB_SIZE in arch_exports.h. Currently we |
| * assume that the assembler uses push imm8 and jmp rel8, but to get that |
| * to happen for nasm 0.98.40 we're forced to use raw bytes for the pushes. As in |
| * back_from_native, this code is executed natively by the app, so we assume the |
| * app stack is valid and can be clobbered. |
| */ |
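| /* Size check: each stub below is push imm8 (2 bytes: 6a XX) plus jmp rel8 |
| * (2 bytes), i.e. 4 bytes, so BACK_FROM_NATIVE_RETSTUB_SIZE must be 4, and |
| * the 10 stubs here mean MAX_NATIVE_RETSTACK must be 10. |
| */ |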
| DECLARE_FUNC(back_from_native_retstubs) |
| GLOBAL_LABEL(back_from_native_retstubs:) |
| #ifndef ASSEMBLE_WITH_GAS |
| /* MASM does short jumps for public symbols. */ |
| # define Lback_from_native GLOBAL_REF(back_from_native) |
| #endif |
| RAW(6a) RAW(0) /* push 0 */ |
| jmp short Lback_from_native |
| RAW(6a) RAW(1) /* push 1 */ |
| jmp short Lback_from_native |
| RAW(6a) RAW(2) /* push 2 */ |
| jmp short Lback_from_native |
| RAW(6a) RAW(3) /* push 3 */ |
| jmp short Lback_from_native |
| RAW(6a) RAW(4) /* push 4 */ |
| jmp short Lback_from_native |
| RAW(6a) RAW(5) /* push 5 */ |
| jmp short Lback_from_native |
| RAW(6a) RAW(6) /* push 6 */ |
| jmp short Lback_from_native |
| RAW(6a) RAW(7) /* push 7 */ |
| jmp short Lback_from_native |
| RAW(6a) RAW(8) /* push 8 */ |
| jmp short Lback_from_native |
| RAW(6a) RAW(9) /* push 9 */ |
| jmp short Lback_from_native |
| DECLARE_GLOBAL(back_from_native_retstubs_end) |
| #ifndef ASSEMBLE_WITH_GAS |
| # undef Lback_from_native |
| #endif |
| ADDRTAKEN_LABEL(back_from_native_retstubs_end:) |
| END_FUNC(back_from_native_retstubs) |
| |
| /* |
| * back_from_native -- for taking control back after letting a module |
| * execute natively |
| * assumptions: app stack is valid |
| */ |
| DECLARE_FUNC(back_from_native) |
| GLOBAL_LABEL(back_from_native:) |
| #ifdef ASSEMBLE_WITH_GAS |
| /* We use Lback_from_native to force short jumps with gas. */ |
| Lback_from_native: |
| #endif |
| /* assume valid esp |
| * FIXME: would be more robust if we didn't use the app's esp -- should use initstack |
| */ |
| /* grab exec state and pass as param in a priv_mcontext_t struct */ |
| PUSH_PRIV_MCXT(0 /* for priv_mcontext_t.pc */) |
| lea REG_XAX, [REG_XSP] /* stack grew down, so priv_mcontext_t at tos */ |
| |
| /* Call return_from_native passing the priv_mcontext_t. It will obtain |
| * this thread's dcontext pointer and begin execution with the passed-in |
| * state. |
| */ |
| #if defined(X64) || defined(MACOS) |
| and REG_XSP, -FRAME_ALIGNMENT /* x64 or Mac alignment */ |
| #endif |
| CALLC1(GLOBAL_REF(return_from_native), REG_XAX) |
| /* should not return */ |
| jmp GLOBAL_REF(unexpected_return) |
| END_FUNC(back_from_native) |
| |
| #ifdef UNIX |
| /* Like back_from_native, except we're calling from a native module into a |
| * module that should execute from the code cache. We transfer here from PLT |
| * stubs generated by create_plt_stub() in core/unix/native_elf.c. See also |
| * initialize_plt_stub_template(). On entry, next_pc is on the stack for ia32 |
| * and in %r11 for x64. We use %r11 because it is scratch in the sysv amd64 |
| * calling convention. |
| */ |
| DECLARE_FUNC(native_plt_call) |
| GLOBAL_LABEL(native_plt_call:) |
| PUSH_PRIV_MCXT(0 /* pc */) |
| lea REG_XAX, [REG_XSP] /* lea priv_mcontext_t */ |
| # ifdef X64 |
| mov REG_XCX, r11 /* next_pc in r11 */ |
| # else |
| mov REG_XCX, [REG_XSP + PRIV_MCXT_SIZE] /* next_pc on stack */ |
| add DWORD [REG_XAX + MCONTEXT_XSP_OFFS], ARG_SZ /* adjust app xsp for arg */ |
| # endif |
| CALLC2_FRESH(GLOBAL_REF(native_module_callout), REG_XAX, REG_XCX) |
| |
| /* If we returned, continue to execute natively on the app stack. */ |
| POP_PRIV_MCXT_GPRS() |
| # ifdef X64 |
| jmp r11 /* next_pc still in r11 */ |
| # else |
| ret /* next_pc was on stack */ |
| # endif |
| END_FUNC(native_plt_call) |
| #endif /* UNIX */ |
| |
| |
| /* Our versions of setjmp & longjmp. We don't want to modify app state like |
| * SEH or do the unwinding that the standard versions perform. |
| */ |
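| /* Typical C-level pattern (sketch; DR's try/except wrappers in the C headers |
| * build on this): |
| *   if (dr_setjmp(&buf) == 0) { |
| *       ... code that might fault ... |
| *   } else { |
| *       ... reached via dr_longjmp(&buf, retval) with retval != 0 ... |
| *   } |
| */ |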
| |
| #ifdef CLIENT_INTERFACE |
| /* Front-end for client use where we don't want to expose our struct layouts, |
| * yet we must call dr_setjmp directly w/o a call frame in between for |
| * a proper restore point. |
| * |
| * int dr_try_start(try_except_context_t *cxt); |
| */ |
| # define TRY_CXT_SETJMP_OFFS 0 /* offsetof(try_except_context_t, context) */ |
| DECLARE_EXPORTED_FUNC(dr_try_start) |
| GLOBAL_LABEL(dr_try_start:) |
| add ARG1, TRY_CXT_SETJMP_OFFS |
| jmp GLOBAL_REF(dr_setjmp) |
| END_FUNC(dr_try_start) |
| #endif /* CLIENT_INTERFACE */ |
| |
| /* int cdecl dr_setjmp(dr_jmp_buf *buf); |
| */ |
| DECLARE_FUNC(dr_setjmp) |
| GLOBAL_LABEL(dr_setjmp:) |
| #ifdef UNIX |
| /* PR 206278: for try/except we need to save the signal mask */ |
| mov REG_XDX, ARG1 |
| push REG_XDX /* preserve */ |
| # if defined(MACOS) && !defined(X64) |
| lea REG_XSP, [-2*ARG_SZ + REG_XSP] /* maintain align-16: ra + push */ |
| # endif |
| CALLC1(GLOBAL_REF(dr_setjmp_sigmask), REG_XDX) |
| # if defined(MACOS) && !defined(X64) |
| lea REG_XSP, [2*ARG_SZ + REG_XSP] /* maintain align-16: ra + push */ |
| # endif |
| pop REG_XDX /* preserve */ |
| #else |
| mov REG_XDX, ARG1 |
| #endif |
| mov [ 0 + REG_XDX], REG_XBX |
| mov [ ARG_SZ + REG_XDX], REG_XCX |
| mov [2*ARG_SZ + REG_XDX], REG_XDI |
| mov [3*ARG_SZ + REG_XDX], REG_XSI |
| mov [4*ARG_SZ + REG_XDX], REG_XBP |
| mov [5*ARG_SZ + REG_XDX], REG_XSP |
| mov REG_XAX, [REG_XSP] |
| mov [6*ARG_SZ + REG_XDX], REG_XAX |
| #ifdef X64 |
| mov [ 7*ARG_SZ + REG_XDX], r8 |
| mov [ 8*ARG_SZ + REG_XDX], r9 |
| mov [ 9*ARG_SZ + REG_XDX], r10 |
| mov [10*ARG_SZ + REG_XDX], r11 |
| mov [11*ARG_SZ + REG_XDX], r12 |
| mov [12*ARG_SZ + REG_XDX], r13 |
| mov [13*ARG_SZ + REG_XDX], r14 |
| mov [14*ARG_SZ + REG_XDX], r15 |
| #endif |
| xor eax, eax |
| ret |
| END_FUNC(dr_setjmp) |
| |
| /* int cdecl dr_longjmp(dr_jmp_buf *buf, int retval); |
| */ |
| DECLARE_FUNC(dr_longjmp) |
| GLOBAL_LABEL(dr_longjmp:) |
| mov REG_XDX, ARG1 |
| mov REG_XAX, ARG2 |
| |
| mov REG_XBX, [ 0 + REG_XDX] |
| mov REG_XDI, [2*ARG_SZ + REG_XDX] |
| mov REG_XSI, [3*ARG_SZ + REG_XDX] |
| mov REG_XBP, [4*ARG_SZ + REG_XDX] |
| mov REG_XSP, [5*ARG_SZ + REG_XDX] /* now we've switched to the old stack */ |
| mov REG_XCX, [6*ARG_SZ + REG_XDX] |
| mov [REG_XSP], REG_XCX /* restore the return address on to the stack */ |
| mov REG_XCX, [ ARG_SZ + REG_XDX] |
| #ifdef X64 |
| mov r8, [ 7*ARG_SZ + REG_XDX] |
| mov r9, [ 8*ARG_SZ + REG_XDX] |
| mov r10, [ 9*ARG_SZ + REG_XDX] |
| mov r11, [10*ARG_SZ + REG_XDX] |
| mov r12, [11*ARG_SZ + REG_XDX] |
| mov r13, [12*ARG_SZ + REG_XDX] |
| mov r14, [13*ARG_SZ + REG_XDX] |
| mov r15, [14*ARG_SZ + REG_XDX] |
| #endif |
| ret |
| END_FUNC(dr_longjmp) |
| |
| |
| /*############################################################################# |
| *############################################################################# |
| * Utility routines moved here due to the lack of inline asm support |
| * in VC8. |
| */ |
| |
| /* uint atomic_swap(uint *addr, uint value) |
| * return current contents of addr and replace contents with value. |
| * on win32 could use InterlockedExchange intrinsic instead. |
| */ |
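| /* Roughly equivalent C (sketch): { uint old = *addr; *addr = value; return old; } |
| * performed atomically: xchg with a memory operand carries an implicit lock. |
| */ |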
| DECLARE_FUNC(atomic_swap) |
| GLOBAL_LABEL(atomic_swap:) |
| mov REG_XAX, ARG2 |
| mov REG_XCX, ARG1 /* nop on win64 (ditto for linux64 if used rdi) */ |
| xchg [REG_XCX], eax |
| ret |
| END_FUNC(atomic_swap) |
| |
| /* bool cpuid_supported(void) |
| * Checks for existence of the cpuid instr by attempting to modify bit 21 of eflags |
| */ |
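| /* How the check works: bit 21 of eflags is the ID flag (mask 0x200000); it |
| * can be toggled only on processors that implement cpuid. We read eflags, |
| * flip the bit, write it back, re-read, and report whether the flip stuck. |
| */ |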
| DECLARE_FUNC(cpuid_supported) |
| GLOBAL_LABEL(cpuid_supported:) |
| PUSHF |
| pop REG_XAX |
| mov ecx, eax /* original eflags in ecx */ |
| xor eax, HEX(200000) /* try to modify bit 21 of eflags */ |
| push REG_XAX |
| POPF |
| PUSHF |
| pop REG_XAX |
| cmp ecx, eax |
| mov eax, 0 /* zero out top bytes */ |
| setne al |
| push REG_XCX /* now restore original eflags */ |
| POPF |
| ret |
| END_FUNC(cpuid_supported) |
| |
| /* void our_cpuid(int res[4], int eax) |
| * Executes cpuid instr, which is hard for x64 inline asm b/c clobbers rbx and can't |
| * push in middle of func. |
| */ |
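| /* Example use (illustrative): int r[4]; our_cpuid(r, 1); afterwards r[0..3] |
| * hold eax, ebx, ecx, edx for leaf 1, e.g. SSE support is bit 25 of r[3]. |
| */ |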
| DECLARE_FUNC(our_cpuid) |
| GLOBAL_LABEL(our_cpuid:) |
| mov REG_XDX, ARG1 |
| mov REG_XAX, ARG2 |
| push REG_XBX /* callee-saved */ |
| push REG_XDI /* callee-saved */ |
| /* not making a call so don't bother w/ 16-byte stack alignment */ |
| mov REG_XDI, REG_XDX |
| cpuid |
| mov [ 0 + REG_XDI], eax |
| mov [ 4 + REG_XDI], ebx |
| mov [ 8 + REG_XDI], ecx |
| mov [12 + REG_XDI], edx |
| pop REG_XDI /* callee-saved */ |
| pop REG_XBX /* callee-saved */ |
| ret |
| END_FUNC(our_cpuid) |
| |
| /* We could use inline asm on Linux but it's cleaner to share the same code: */ |
| /* void dr_stmxcsr(uint *val) */ |
| #define FUNCNAME dr_stmxcsr |
| DECLARE_FUNC(FUNCNAME) |
| GLOBAL_LABEL(FUNCNAME:) |
| mov REG_XAX, ARG1 |
| stmxcsr [REG_XAX] |
| ret |
| END_FUNC(FUNCNAME) |
| #undef FUNCNAME |
| |
| /* void dr_xgetbv(uint *high, uint *low) */ |
| #define FUNCNAME dr_xgetbv |
| DECLARE_FUNC(FUNCNAME) |
| GLOBAL_LABEL(FUNCNAME:) |
| mov REG_XAX, ARG1 |
| mov REG_XDX, ARG2 |
| push REG_XAX /* high */ |
| push REG_XDX /* low */ |
| mov ecx, 0 |
| /* VS2005 assembler doesn't know xgetbv */ |
| RAW(0f) RAW(01) RAW(d0) /* xgetbv */ |
| pop REG_XCX |
| mov DWORD [REG_XCX], eax /* low */ |
| pop REG_XCX |
| mov DWORD [REG_XCX], edx /* high */ |
| ret |
| END_FUNC(FUNCNAME) |
| #undef FUNCNAME |
| |
| /* void dr_fxsave(byte *buf_aligned) */ |
| #define FUNCNAME dr_fxsave |
| DECLARE_FUNC(FUNCNAME) |
| GLOBAL_LABEL(FUNCNAME:) |
| mov REG_XAX, ARG1 |
| #ifdef X64 |
| /* VS2005 doesn't know "fxsave64" (and it's "fxsaveq" for gcc 4.4) */ |
| RAW(48) RAW(0f) RAW(ae) RAW(00) /* fxsave64 [REG_XAX] */ |
| #else |
| fxsave [REG_XAX] |
| #endif |
| fnclex |
| finit |
| ret |
| END_FUNC(FUNCNAME) |
| #undef FUNCNAME |
| |
| /* void dr_fnsave(byte *buf_aligned) */ |
| #define FUNCNAME dr_fnsave |
| DECLARE_FUNC(FUNCNAME) |
| GLOBAL_LABEL(FUNCNAME:) |
| mov REG_XAX, ARG1 |
| /* FIXME: do we need an fwait prior to the fnsave? */ |
| fnsave [REG_XAX] |
| fwait |
| ret |
| END_FUNC(FUNCNAME) |
| #undef FUNCNAME |
| |
| /* void dr_fxrstor(byte *buf_aligned) */ |
| #define FUNCNAME dr_fxrstor |
| DECLARE_FUNC(FUNCNAME) |
| GLOBAL_LABEL(FUNCNAME:) |
| mov REG_XAX, ARG1 |
| #ifdef X64 |
| /* VS2005 doesn't know "fxrstor64" */ |
| RAW(48) RAW(0f) RAW(ae) RAW(08) /* fxrstor64 [REG_XAX] */ |
| #else |
| fxrstor [REG_XAX] |
| #endif |
| ret |
| END_FUNC(FUNCNAME) |
| #undef FUNCNAME |
| |
| /* void dr_frstor(byte *buf_aligned) */ |
| #define FUNCNAME dr_frstor |
| DECLARE_FUNC(FUNCNAME) |
| GLOBAL_LABEL(FUNCNAME:) |
| mov REG_XAX, ARG1 |
| frstor [REG_XAX] |
| ret |
| END_FUNC(FUNCNAME) |
| #undef FUNCNAME |
| |
| #ifdef X64 |
| /* void dr_fxsave32(byte *buf_aligned) */ |
| #define FUNCNAME dr_fxsave32 |
| DECLARE_FUNC(FUNCNAME) |
| GLOBAL_LABEL(FUNCNAME:) |
| mov REG_XAX, ARG1 |
| fxsave [REG_XAX] |
| fnclex |
| finit |
| ret |
| END_FUNC(FUNCNAME) |
| #undef FUNCNAME |
| |
| /* void dr_fxrstor32(byte *buf_aligned) */ |
| #define FUNCNAME dr_fxrstor32 |
| DECLARE_FUNC(FUNCNAME) |
| GLOBAL_LABEL(FUNCNAME:) |
| mov REG_XAX, ARG1 |
| fxrstor [REG_XAX] |
| ret |
| END_FUNC(FUNCNAME) |
| #undef FUNCNAME |
| #endif |
| |
| #ifdef WINDOWS /* on linux we use inline asm versions */ |
| |
| /* byte *get_frame_ptr(void) |
| * returns the value of ebp |
| */ |
| DECLARE_FUNC(get_frame_ptr) |
| GLOBAL_LABEL(get_frame_ptr:) |
| mov REG_XAX, REG_XBP |
| ret |
| END_FUNC(get_frame_ptr) |
| |
| /* |
| * void call_modcode_alt_stack(dcontext_t *dcontext, |
| * EXCEPTION_RECORD *pExcptRec, |
| * CONTEXT *cxt, app_pc target, uint flags, |
| * bool using_initstack, fragment_t *f) |
| * custom routine used to transfer control from check_for_modified_code() |
| * to found_modified_code() win32/callback.c. |
| */ |
| #define dcontext ARG1 |
| #define pExcptRec ARG2 |
| #define cxt ARG3 |
| #define target ARG4 |
| #define flags ARG5 |
| #define using_initstack ARG6 |
| #define fragment ARG7 |
| DECLARE_FUNC(call_modcode_alt_stack) |
| GLOBAL_LABEL(call_modcode_alt_stack:) |
| mov REG_XAX, dcontext /* be careful not to clobber other in-reg params */ |
| mov REG_XBX, pExcptRec |
| mov REG_XDI, cxt |
| mov REG_XSI, target |
| mov REG_XDX, flags |
| mov REG_XCX, fragment |
| /* bool is byte-sized but rest should be zeroed as separate param */ |
| cmp using_initstack, 0 |
| je call_modcode_alt_stack_no_free |
| mov DWORD SYMREF(initstack_mutex), 0 /* rip-relative on x64 */ |
| call_modcode_alt_stack_no_free: |
| RESTORE_FROM_DCONTEXT_VIA_REG(REG_XAX,dstack_OFFSET,REG_XSP) |
| CALLC6(GLOBAL_REF(found_modified_code), REG_XAX, REG_XBX, REG_XDI, REG_XSI, REG_XDX, REG_XCX) |
| /* should never return */ |
| jmp GLOBAL_REF(unexpected_return) |
| ret |
| END_FUNC(call_modcode_alt_stack) |
| #undef dcontext |
| #undef pExcptRec |
| #undef cxt |
| #undef target |
| #undef flags |
| #undef using_initstack |
| #undef fragment |
| |
| #ifdef STACK_GUARD_PAGE |
| /* |
| * void call_intr_excpt_alt_stack(dcontext_t *dcontext, EXCEPTION_RECORD *pExcptRec, |
| * CONTEXT *cxt, byte *stack) |
| * |
| * Routine to switch to a separate exception stack before calling |
| * internal_exception_info(). This switch is useful if the dstack |
| * is exhausted and we want to ensure we have enough space for |
| * error reporting. |
| */ |
| #define dcontext ARG1 |
| #define pExcptRec ARG2 |
| #define cxt ARG3 |
| #define stack ARG4 |
| DECLARE_FUNC(call_intr_excpt_alt_stack) |
| GLOBAL_LABEL(call_intr_excpt_alt_stack:) |
| mov REG_XAX, dcontext |
| mov REG_XBX, pExcptRec |
| mov REG_XDI, cxt |
| mov REG_XSI, REG_XSP |
| mov REG_XSP, stack |
| # ifdef X64 |
| /* retaddr + this push => 16-byte alignment prior to call */ |
| # endif |
| push REG_XSI /* save xsp */ |
| CALLC4(GLOBAL_REF(internal_exception_info), \ |
| REG_XAX /* dcontext */, \ |
| REG_XBX /* pExcptRec */, \ |
| REG_XDI /* cxt */, \ |
| 1 /* dstack overflow == true */) |
| pop REG_XSP |
| ret |
| END_FUNC(call_intr_excpt_alt_stack) |
| #undef dcontext |
| #undef pExcptRec |
| #undef cxt |
| #undef stack |
| #endif /* STACK_GUARD_PAGE */ |
| |
| /* CONTEXT.Seg* is WORD for x64 but DWORD for x86 */ |
| #ifdef X64 |
| # define REG_XAX_SEGWIDTH ax |
| #else |
| # define REG_XAX_SEGWIDTH eax |
| #endif |
| /* Need a second volatile register for any calling convention. In all |
| * conventions, XCX is volatile, but it's ARG4 on Lin64 and ARG1 on Win64. |
| * Using XCX on Win64 is fine, but on Lin64 it clobbers ARG4 so we use XDI as |
| * the free reg instead. |
| */ |
| #if defined(UNIX) && defined(X64) |
| # define FREE_REG rdi |
| #else |
| # define FREE_REG REG_XCX |
| #endif |
| /* void get_segments_defg(cxt_seg_t *ds, cxt_seg_t *es, cxt_seg_t *fs, cxt_seg_t *gs) */ |
| DECLARE_FUNC(get_segments_defg) |
| GLOBAL_LABEL(get_segments_defg:) |
| xor eax, eax /* Zero XAX, use it for reading segments. */ |
| mov FREE_REG, ARG1 |
| mov ax, ds |
| mov [FREE_REG], REG_XAX_SEGWIDTH |
| mov FREE_REG, ARG2 |
| mov ax, es |
| mov [FREE_REG], REG_XAX_SEGWIDTH |
| mov FREE_REG, ARG3 |
| mov ax, fs |
| mov [FREE_REG], REG_XAX_SEGWIDTH |
| mov FREE_REG, ARG4 |
| mov ax, gs |
| mov [FREE_REG], REG_XAX_SEGWIDTH |
| ret |
| END_FUNC(get_segments_defg) |
| |
| /* void get_segments_cs_ss(cxt_seg_t *cs, cxt_seg_t *ss) */ |
| DECLARE_FUNC(get_segments_cs_ss) |
| GLOBAL_LABEL(get_segments_cs_ss:) |
| xor eax, eax /* Zero XAX, use it for reading segments. */ |
| mov FREE_REG, ARG1 |
| mov ax, cs |
| mov [FREE_REG], REG_XAX_SEGWIDTH |
| mov FREE_REG, ARG2 |
| mov ax, ss |
| mov [FREE_REG], REG_XAX_SEGWIDTH |
| ret |
| END_FUNC(get_segments_cs_ss) |
| #undef FREE_REG |
| #undef REG_XAX_SEGWIDTH |
| |
| /* void get_own_context_helper(CONTEXT *cxt) |
| * does not fix up xsp to match the call site |
| * does not preserve callee-saved registers |
| */ |
| DECLARE_FUNC(get_own_context_helper) |
| GLOBAL_LABEL(get_own_context_helper:) |
| /* push callee-saved registers that we use */ |
| push REG_XBX |
| push REG_XSI |
| push REG_XDI |
| #ifdef X64 |
| /* w/ retaddr, we're now at 16-byte alignment */ |
| /* save argument register (PUSH_PRIV_MCXT calls out to c code) */ |
| mov REG_XDI, ARG1 |
| #endif |
| |
| /* grab exec state and pass as param in a priv_mcontext_t struct */ |
| /* use retaddr for pc */ |
| PUSH_PRIV_MCXT([(3 * ARG_SZ) + REG_XSP - PUSH_PRIV_MCXT_PRE_PC_SHIFT]) |
| /* we don't have enough registers to avoid parameter regs so we carefully |
| * use the suggested register order |
| */ |
| lea REG_XSI, [REG_XSP] /* stack grew down, so priv_mcontext_t at tos */ |
| #ifdef X64 |
| mov REG_XAX, REG_XDI |
| #else |
| /* 4 * arg_sz = 3 callee saved registers pushed to stack plus return addr */ |
| mov REG_XAX, [PRIV_MCXT_SIZE + (4 * ARG_SZ) + REG_XSP] |
| #endif |
| xor edi, edi |
| mov di, ss |
| xor ebx, ebx |
| mov bx, cs |
| CALLC4(GLOBAL_REF(get_own_context_integer_control), REG_XAX, REG_XBX, REG_XDI, REG_XSI) |
| add REG_XSP, PRIV_MCXT_SIZE |
| pop REG_XDI |
| pop REG_XSI |
| pop REG_XBX |
| ret |
| END_FUNC(get_own_context_helper) |
| |
| #endif /* WINDOWS */ |
| |
| /* void get_xmm_caller_saved(byte *xmm_caller_saved_buf) |
| * stores the values of xmm0 through xmm5 consecutively into xmm_caller_saved_buf. |
| * xmm_caller_saved_buf need not be 16-byte aligned. |
| * for linux, also saves xmm6-15 (PR 302107). |
| * caller must ensure that the underlying processor supports SSE! |
| * FIXME PR 266305: AMD optimization guide says to use movlps+movhps for unaligned |
| * stores, instead of movups (movups is best for loads): but for |
| * simplicity I'm sticking with movups (assumed not perf-critical here). |
| */ |
| DECLARE_FUNC(get_xmm_caller_saved) |
| GLOBAL_LABEL(get_xmm_caller_saved:) |
| mov REG_XAX, ARG1 |
| movups [REG_XAX + 0*XMM_SAVED_REG_SIZE], xmm0 |
| movups [REG_XAX + 1*XMM_SAVED_REG_SIZE], xmm1 |
| movups [REG_XAX + 2*XMM_SAVED_REG_SIZE], xmm2 |
| movups [REG_XAX + 3*XMM_SAVED_REG_SIZE], xmm3 |
| movups [REG_XAX + 4*XMM_SAVED_REG_SIZE], xmm4 |
| movups [REG_XAX + 5*XMM_SAVED_REG_SIZE], xmm5 |
| #ifdef UNIX |
| movups [REG_XAX + 6*XMM_SAVED_REG_SIZE], xmm6 |
| movups [REG_XAX + 7*XMM_SAVED_REG_SIZE], xmm7 |
| #endif |
| #if defined(UNIX) && defined(X64) |
| movups [REG_XAX + 8*XMM_SAVED_REG_SIZE], xmm8 |
| movups [REG_XAX + 9*XMM_SAVED_REG_SIZE], xmm9 |
| movups [REG_XAX + 10*XMM_SAVED_REG_SIZE], xmm10 |
| movups [REG_XAX + 11*XMM_SAVED_REG_SIZE], xmm11 |
| movups [REG_XAX + 12*XMM_SAVED_REG_SIZE], xmm12 |
| movups [REG_XAX + 13*XMM_SAVED_REG_SIZE], xmm13 |
| movups [REG_XAX + 14*XMM_SAVED_REG_SIZE], xmm14 |
| movups [REG_XAX + 15*XMM_SAVED_REG_SIZE], xmm15 |
| #endif |
| ret |
| END_FUNC(get_xmm_caller_saved) |
| |
| /* void get_ymm_caller_saved(byte *ymm_caller_saved_buf) |
| * stores the values of ymm0 through ymm5 consecutively into ymm_caller_saved_buf. |
| * ymm_caller_saved_buf need not be 32-byte aligned. |
| * for linux, also saves ymm6-15 (PR 302107). |
| * caller must ensure that the underlying processor supports AVX! |
| */ |
| DECLARE_FUNC(get_ymm_caller_saved) |
| GLOBAL_LABEL(get_ymm_caller_saved:) |
| mov REG_XAX, ARG1 |
| /* i#441: some compilers like gcc 4.3 and VS2005 do not know "vmovdqu". |
| * We just put in the raw bytes for these instrs: |
| * Note the 64/32 bit have the same encoding for either rax or eax. |
| * c5 fe 7f 00 vmovdqu %ymm0,0x00(%xax) |
| * c5 fe 7f 48 20 vmovdqu %ymm1,0x20(%xax) |
| * c5 fe 7f 50 40 vmovdqu %ymm2,0x40(%xax) |
| * c5 fe 7f 58 60 vmovdqu %ymm3,0x60(%xax) |
| * c5 fe 7f a0 80 00 00 00 vmovdqu %ymm4,0x80(%xax) |
| * c5 fe 7f a8 a0 00 00 00 vmovdqu %ymm5,0xa0(%xax) |
| */ |
| RAW(c5) RAW(fe) RAW(7f) RAW(00) |
| RAW(c5) RAW(fe) RAW(7f) RAW(48) RAW(20) |
| RAW(c5) RAW(fe) RAW(7f) RAW(50) RAW(40) |
| RAW(c5) RAW(fe) RAW(7f) RAW(58) RAW(60) |
| RAW(c5) RAW(fe) RAW(7f) RAW(a0) RAW(80) RAW(00) RAW(00) RAW(00) |
| RAW(c5) RAW(fe) RAW(7f) RAW(a8) RAW(a0) RAW(00) RAW(00) RAW(00) |
| #ifdef UNIX |
| /* |
| * c5 fe 7f b0 c0 00 00 00 vmovdqu %ymm6,0xc0(%xax) |
| * c5 fe 7f b8 e0 00 00 00 vmovdqu %ymm7,0xe0(%xax) |
| */ |
| RAW(c5) RAW(fe) RAW(7f) RAW(b0) RAW(c0) RAW(00) RAW(00) RAW(00) |
| RAW(c5) RAW(fe) RAW(7f) RAW(b8) RAW(e0) RAW(00) RAW(00) RAW(00) |
| # ifdef X64 |
| /* |
| * c5 7e 7f 80 00 01 00 00 vmovdqu %ymm8, 0x100(%xax) |
| * c5 7e 7f 88 20 01 00 00 vmovdqu %ymm9, 0x120(%xax) |
| * c5 7e 7f 90 40 01 00 00 vmovdqu %ymm10,0x140(%xax) |
| * c5 7e 7f 98 60 01 00 00 vmovdqu %ymm11,0x160(%xax) |
| * c5 7e 7f a0 80 01 00 00 vmovdqu %ymm12,0x180(%xax) |
| * c5 7e 7f a8 a0 01 00 00 vmovdqu %ymm13,0x1a0(%xax) |
| * c5 7e 7f b0 c0 01 00 00 vmovdqu %ymm14,0x1c0(%xax) |
| * c5 7e 7f b8 e0 01 00 00 vmovdqu %ymm15,0x1e0(%xax) |
| */ |
| RAW(c5) RAW(7e) RAW(7f) RAW(80) RAW(00) RAW(01) RAW(00) RAW(00) |
| RAW(c5) RAW(7e) RAW(7f) RAW(88) RAW(20) RAW(01) RAW(00) RAW(00) |
| RAW(c5) RAW(7e) RAW(7f) RAW(90) RAW(40) RAW(01) RAW(00) RAW(00) |
| RAW(c5) RAW(7e) RAW(7f) RAW(98) RAW(60) RAW(01) RAW(00) RAW(00) |
| RAW(c5) RAW(7e) RAW(7f) RAW(a0) RAW(80) RAW(01) RAW(00) RAW(00) |
| RAW(c5) RAW(7e) RAW(7f) RAW(a8) RAW(a0) RAW(01) RAW(00) RAW(00) |
| RAW(c5) RAW(7e) RAW(7f) RAW(b0) RAW(c0) RAW(01) RAW(00) RAW(00) |
| RAW(c5) RAW(7e) RAW(7f) RAW(b8) RAW(e0) RAW(01) RAW(00) RAW(00) |
| # endif |
| #endif |
| ret |
| END_FUNC(get_ymm_caller_saved) |
| |
| /* void hashlookup_null_handler(void) |
| * PR 305731: if the app targets NULL, it ends up here, which indirects |
| * through hashlookup_null_target to end up in an ibl miss routine. |
| */ |
| DECLARE_FUNC(hashlookup_null_handler) |
| GLOBAL_LABEL(hashlookup_null_handler:) |
| #if !defined(X64) && defined(LINUX) |
| /* We don't have any free registers to make this PIC so we patch |
| * this up. It would be better to generate than patch .text, |
| * but we need a static address to reference in null_fragment |
| * (though if we used shared ibl target_delete we could |
| * set our final address prior to using null_fragment anywhere). |
| */ |
| jmp hashlookup_null_handler |
| #else |
| jmp PTRSZ SYMREF(hashlookup_null_target) /* rip-relative on x64 */ |
| #endif |
| END_FUNC(hashlookup_null_handler) |
| |
| #ifdef X64 |
| # define PTRSZ_SHIFT_BITS 3 |
| # define PTRSZ_SUFFIXED(string_op) string_op##q |
| # ifdef UNIX |
| # define ARGS_TO_XDI_XSI_XDX() /* ABI handles this. */ |
| # define RESTORE_XDI_XSI() /* Not needed. */ |
| # else /* WINDOWS */ |
| /* Get args 1, 2, 3 into rdi, rsi, and rdx. */ |
| # define ARGS_TO_XDI_XSI_XDX() \ |
| push rdi @N@\ |
| push rsi @N@\ |
| mov rdi, ARG1 @N@\ |
| mov rsi, ARG2 @N@\ |
| mov rdx, ARG3 |
| # define RESTORE_XDI_XSI() \ |
| pop rsi @N@\ |
| pop rdi |
| # endif /* WINDOWS */ |
| #else |
| # define PTRSZ_SHIFT_BITS 2 |
| # define PTRSZ_SUFFIXED(string_op) string_op##d |
| |
| /* Get args 1, 2, 3 into edi, esi, and edx to match Linux x64 ABI. Need to save |
| * edi and esi since they are callee-saved. The ARGN macros can't handle |
| * stack adjustments, so use the scratch regs eax and ecx to hold the args |
| * before the pushes. |
| */ |
| # define ARGS_TO_XDI_XSI_XDX() \ |
| mov eax, ARG1 @N@\ |
| mov ecx, ARG2 @N@\ |
| mov edx, ARG3 @N@\ |
| push edi @N@\ |
| push esi @N@\ |
| mov edi, eax @N@\ |
| mov esi, ecx |
| # define RESTORE_XDI_XSI() \ |
| pop esi @N@\ |
| pop edi |
| #endif |
| |
| /* Repeats string_op for XDX bytes using aligned pointer-sized operations when |
| * possible. Assumes that string_op works by counting down until XCX reaches |
| * zero. The pointer-sized string ops are aligned based on ptr_to_align. |
| * For string ops that have both a src and dst, aligning based on src is |
| * preferred, subject to micro-architectural differences. |
| * |
| * XXX: glibc memcpy uses SSE instructions to copy, which is 10% faster on x64 |
| * and ~2x faster for 20kb copies on plain x86. Using SSE is quite complicated, |
| * because it means doing cpuid checks and loop unrolling. Many of our string |
| * operations are short anyway. For safe_read, it also increases the number of |
| * potentially faulting PCs. |
| */ |
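| /* Worked example (32-bit, illustrative): n = 23 with ptr_to_align ending in |
| * ...01 gives a 3-byte head to reach 4-byte alignment, leaving 20 bytes; the |
| * aligned pass then does 20 >> 2 = 5 dword ops, and 20 & 3 = 0 trailing bytes. |
| */ |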
| #define REP_STRING_OP(funcname, ptr_to_align, string_op) \ |
| mov REG_XCX, ptr_to_align @N@\ |
| and REG_XCX, (ARG_SZ - 1) @N@\ |
| jz funcname##_aligned @N@\ |
| neg REG_XCX @N@\ |
| add REG_XCX, ARG_SZ @N@\ |
| cmp REG_XDX, REG_XCX /* if (n < xcx) */ @N@\ |
| cmovb REG_XCX, REG_XDX /* xcx = n; */ @N@\ |
| sub REG_XDX, REG_XCX @N@\ |
| ADDRTAKEN_LABEL(funcname##_pre:) @N@\ |
| rep string_op##b @N@\ |
| funcname##_aligned: @N@\ |
| /* Aligned word-size ops. */ @N@\ |
| mov REG_XCX, REG_XDX @N@\ |
| shr REG_XCX, PTRSZ_SHIFT_BITS @N@\ |
| ADDRTAKEN_LABEL(funcname##_mid:) @N@\ |
| rep PTRSZ_SUFFIXED(string_op) @N@\ |
| /* Handle trailing bytes. */ @N@\ |
| mov REG_XCX, REG_XDX @N@\ |
| and REG_XCX, (ARG_SZ - 1) @N@\ |
| ADDRTAKEN_LABEL(funcname##_post:) @N@\ |
| rep string_op##b |
| |
| /* Declare these labels global so we can take their addresses in C. pre, mid, |
| * and post are defined by REP_STRING_OP(). |
| */ |
| DECLARE_GLOBAL(safe_read_asm_pre) |
| DECLARE_GLOBAL(safe_read_asm_mid) |
| DECLARE_GLOBAL(safe_read_asm_post) |
| DECLARE_GLOBAL(safe_read_asm_recover) |
| |
| /* i#350: We implement safe_read in assembly and save the PCs that can fault. |
| * If these PCs fault, we return from the signal handler to the epilog, which |
| * can recover. We return the source pointer from XSI, and the caller uses this |
| * to determine how many bytes were copied and whether it matches size. |
| * |
| * XXX: Do we care about differentiating whether the read or write faulted? |
| * Currently this is just "safe_memcpy", and we recover regardless of whether |
| * the read or write faulted. |
| * |
| * void * |
| * safe_read_asm(void *dst, const void *src, size_t n); |
| */ |
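| /* Sketch of the caller-side logic (the real wrapper lives in the C code): |
| *   void *stop = safe_read_asm(dst, src, n); |
| *   size_t copied = (byte *)stop - (byte *)src; |
| *   bool ok = (copied == n); |
| */ |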
| DECLARE_FUNC(safe_read_asm) |
| GLOBAL_LABEL(safe_read_asm:) |
| ARGS_TO_XDI_XSI_XDX() /* dst=xdi, src=xsi, n=xdx */ |
| /* Copy xdx bytes, align on src. */ |
| REP_STRING_OP(safe_read_asm, REG_XSI, movs) |
| ADDRTAKEN_LABEL(safe_read_asm_recover:) |
| mov REG_XAX, REG_XSI /* Return cur_src */ |
| RESTORE_XDI_XSI() |
| ret |
| END_FUNC(safe_read_asm) |
| |
| #ifdef UNIX |
| /* i#46: Implement private memcpy and memset for libc isolation. If we import |
| * memcpy and memset from libc in the normal way, the application can override |
| * those definitions and intercept them. In particular, this occurs when |
| * running an app that links in the Address Sanitizer runtime. Since we already |
| * need a reasonably efficient assembly memcpy implementation for safe_read, we |
| * go ahead and reuse the code for private memcpy and memset. |
| * |
| * XXX: See comment on REP_STRING_OP about maybe using SSE instrs. It's more |
| * viable for memcpy and memset than for safe_read_asm. |
| */ |
| |
| /* Private memcpy. |
| */ |
| DECLARE_FUNC(memcpy) |
| GLOBAL_LABEL(memcpy:) |
| ARGS_TO_XDI_XSI_XDX() /* dst=xdi, src=xsi, n=xdx */ |
| mov REG_XAX, REG_XDI /* Save dst for return. */ |
| /* Copy xdx bytes, align on src. */ |
| REP_STRING_OP(memcpy, REG_XSI, movs) |
| RESTORE_XDI_XSI() |
| ret /* Return original dst. */ |
| END_FUNC(memcpy) |
| |
| /* Private memset. |
| */ |
| DECLARE_FUNC(memset) |
| GLOBAL_LABEL(memset:) |
| ARGS_TO_XDI_XSI_XDX() /* dst=xdi, val=xsi, n=xdx */ |
| push REG_XDI /* Save dst for return. */ |
| test esi, esi /* Usually val is zero. */ |
| jnz make_val_word_size |
| xor eax, eax |
| do_memset: |
| /* Set xdx bytes, align on dst. */ |
| REP_STRING_OP(memset, REG_XDI, stos) |
| pop REG_XAX /* Return original dst. */ |
| RESTORE_XDI_XSI() |
| ret |
| |
| /* Create pointer-sized value in XAX using multiply. */ |
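| /* Example: val = 0xab gives 0xab * 0x01010101 = 0xabababab (the 64-bit |
| * constant likewise replicates the byte into all 8 byte lanes). |
| */ |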
| make_val_word_size: |
| and esi, HEX(ff) |
| # ifdef X64 |
| mov rax, HEX(0101010101010101) |
| # else |
| mov eax, HEX(01010101) |
| # endif |
| /* Use two-operand imul to avoid clobbering XDX. */ |
| imul REG_XAX, REG_XSI |
| jmp do_memset |
| END_FUNC(memset) |
| |
| |
| # ifndef MACOS /* XXX: attribute alias issue, plus using nasm */ |
| /* gcc emits calls to these *_chk variants in release builds when the size of |
| * dst is known at compile time. In C, the caller is responsible for cleaning |
| * up arguments on the stack, so we alias these *_chk routines to the non-chk |
| * routines and rely on the caller to clean up the extra dst_len arg. |
| */ |
| .global __memcpy_chk |
| .hidden __memcpy_chk |
| .set __memcpy_chk,memcpy |
| |
| .global __memset_chk |
| .hidden __memset_chk |
| .set __memset_chk,memset |
| # endif |
| |
| |
| /* Replacement for _dl_runtime_resolve() used for catching module transitions |
| * out of native modules. |
| */ |
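| /* Background (standard ELF lazy binding): the resolver is reached via a jmp |
| * from PLT0 with the link_map pointer and the relocation index already on the |
| * stack, and must reach the resolved target with the original argument |
| * registers intact -- hence the save/restore below and the jmp (x64) or |
| * ret-with-arg-pop (ia32) at the end. dynamorio_dl_fixup() is our C routine |
| * that returns the target, possibly redirected into DR. |
| */ |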
| DECLARE_FUNC(_dynamorio_runtime_resolve) |
| GLOBAL_LABEL(_dynamorio_runtime_resolve:) |
| # ifdef X64 |
| /* Preserve all 6 argument registers and rax (num fp reg args). */ |
| push rax |
| push rdi |
| push rsi |
| push rdx |
| push rcx |
| push r8 |
| push r9 |
| |
| /* Should be 16-byte aligned now: retaddr, 2 args, 7 regs. */ |
| mov rdi, [rsp + 7 * ARG_SZ] /* link map */ |
| mov rsi, [rsp + 8 * ARG_SZ] /* .dynamic index */ |
| CALLC0(GLOBAL_REF(dynamorio_dl_fixup)) |
| mov r11, rax /* preserve */ |
| |
| pop r9 |
| pop r8 |
| pop rcx |
| pop rdx |
| pop rsi |
| pop rdi |
| pop rax |
| |
| add rsp, 16 /* clear args */ |
| jmp r11 /* Jump to resolved PC, or into DR. */ |
| # else /* !X64 */ |
| push REG_XAX |
| push REG_XCX |
| mov REG_XAX, [REG_XSP + 2 * ARG_SZ] /* link map */ |
| mov REG_XCX, [REG_XSP + 3 * ARG_SZ] /* .dynamic index */ |
| # ifdef MACOS |
| lea REG_XSP, [-1*ARG_SZ + REG_XSP] /* maintain align-16: ra + push x2 */ |
| # endif |
| CALLC2(GLOBAL_REF(dynamorio_dl_fixup), REG_XAX, REG_XCX) |
| # ifdef MACOS |
| lea REG_XSP, [1*ARG_SZ + REG_XSP] /* maintain align-16: ra + push x2 */ |
| # endif |
| mov [REG_XSP + 2 * ARG_SZ], REG_XAX /* overwrite arg1 */ |
| pop REG_XCX |
| pop REG_XAX |
| ret 4 /* ret to target, pop arg2 */ |
| # endif /* !X64 */ |
| END_FUNC(_dynamorio_runtime_resolve) |
| |
| #endif /* UNIX */ |
| |
| /*############################################################################# |
| *############################################################################# |
| */ |
| |
| /****************************************************************************/ |
| /****************************************************************************/ |
| #endif /* !NOT_DYNAMORIO_CORE_PROPER */ |
| |
| /**************************************************************************** |
| * routines shared with NOT_DYNAMORIO_CORE_PROPER |
| */ |
| |
| /* void dr_fpu_exception_init(void) |
| * sets the exception mask flags for both regular float and xmm packed float |
| */ |
| #define FUNCNAME dr_fpu_exception_init |
| DECLARE_FUNC(FUNCNAME) |
| GLOBAL_LABEL(FUNCNAME:) |
| fninit |
| push HEX(1f80) |
| ldmxcsr DWORD [REG_XSP] |
| pop REG_XAX |
| ret |
| END_FUNC(FUNCNAME) |
| #undef FUNCNAME |
| |
| /* void get_mmx_val(OUT uint64 *val, uint index) |
| * Returns the value of mmx register #index in val. |
| */ |
| #define FUNCNAME get_mmx_val |
| DECLARE_FUNC_SEH(FUNCNAME) |
| GLOBAL_LABEL(FUNCNAME:) |
| mov REG_XAX, ARG1 |
| mov REG_XCX, ARG2 |
| END_PROLOG |
| cmp ecx, 0 |
| je get_mmx_0 |
| cmp ecx, 1 |
| je get_mmx_1 |
| cmp ecx, 2 |
| je get_mmx_2 |
| cmp ecx, 3 |
| je get_mmx_3 |
| cmp ecx, 4 |
| je get_mmx_4 |
| cmp ecx, 5 |
| je get_mmx_5 |
| cmp ecx, 6 |
| je get_mmx_6 |
| movq QWORD [REG_XAX], mm7 |
| jmp get_mmx_done |
| get_mmx_6: |
| movq QWORD [REG_XAX], mm6 |
| jmp get_mmx_done |
| get_mmx_5: |
| movq QWORD [REG_XAX], mm5 |
| jmp get_mmx_done |
| get_mmx_4: |
| movq QWORD [REG_XAX], mm4 |
| jmp get_mmx_done |
| get_mmx_3: |
| movq QWORD [REG_XAX], mm3 |
| jmp get_mmx_done |
| get_mmx_2: |
| movq QWORD [REG_XAX], mm2 |
| jmp get_mmx_done |
| get_mmx_1: |
| movq QWORD [REG_XAX], mm1 |
| jmp get_mmx_done |
| get_mmx_0: |
| movq QWORD [REG_XAX], mm0 |
| get_mmx_done: |
| add REG_XSP, 0 /* make a legal SEH64 epilog */ |
| ret |
| END_FUNC(FUNCNAME) |
| #undef FUNCNAME |
| |
| #ifdef WINDOWS |
| |
| /* byte *get_stack_ptr(void) |
| * returns the value of xsp before the call |
| */ |
| DECLARE_FUNC(get_stack_ptr) |
| GLOBAL_LABEL(get_stack_ptr:) |
| mov REG_XAX, REG_XSP |
| add REG_XAX, ARG_SZ /* remove return address space */ |
| ret |
| END_FUNC(get_stack_ptr) |
| |
| /* void load_dynamo(void) |
| * |
| * used for injection into a child process |
| * N.B.: if the code here grows, SIZE_OF_LOAD_DYNAMO in win32/inject.c |
| * must be updated. |
| */ |
| DECLARE_FUNC(load_dynamo) |
| GLOBAL_LABEL(load_dynamo:) |
| /* the code for this routine is copied into an allocation in the app |
| and invoked upon return from the injector. When it is invoked, |
| it expects the app's stack to look like this: |
| |
| xsp-->| &LoadLibrary | for x64 xsp must be 16-aligned |
| | &dynamo_path | |
| | &GetProcAddr | |
| | &dynamo_entry |___ |
| | | | |
| |(saved context)| priv_mcontext_t struct |
| | &code_alloc | | pointer to the code allocation |
| | sizeof(code_alloc)| size of the code allocation |
| |_______________|___| (possible padding for x64 xsp alignment) |
| &dynamo_path-->| | | |
| | (dynamo path) | TEXT(DYNAMORIO_DLL_PATH) |
| |_______________|___| |
| &dynamo_entry-->| | | |
| | (dynamo entry)| "dynamo_auto_start" |
| | |___| |
| |
| |
| in separate allocation ___ |
| | | | |
| | CODE | load_dynamo() code |
| | |___| |
| |
| The load_dynamo routine will load the dynamo DLL into memory, then jump |
| to its dynamo_auto_start entry point, passing it the saved app context as |
| parameters. |
| */ |
| /* two byte NOP to satisfy third party braindead-ness documented in case 3821 */ |
| mov edi, edi |
| #ifdef LOAD_DYNAMO_DEBUGBREAK |
| /* having this code in front may hide the problem addressed with the |
| * above padding */ |
| /* giant loop so we can attach a debugger; then change ebx to 1 |
| * to step through the rest of the code */ |
| mov ebx, HEX(7fffffff) |
| load_dynamo_repeat_outer: |
| mov eax, HEX(7fffffff) |
| load_dynamo_repeatme: |
| dec eax |
| cmp eax, 0 |
| jg load_dynamo_repeatme |
| dec ebx |
| cmp ebx, 0 |
| jg load_dynamo_repeat_outer |
| |
| # ifdef X64 |
| /* xsp is 8-aligned and our pop makes it 16-aligned */ |
| # endif |
| /* TOS has &DebugBreak */ |
| pop REG_XBX /* pop REG_XBX = &DebugBreak */ |
| CALLWIN0(REG_XBX) /* call DebugBreak (in kernel32.lib) */ |
| #endif |
| /* TOS has &LoadLibraryA */ |
| pop REG_XBX /* pop REG_XBX = &LoadLibraryA */ |
| /* TOS has &dynamo_path */ |
| pop REG_XAX /* for 32-bit we're doing "pop eax, push eax" */ |
| CALLWIN1(REG_XBX, REG_XAX) /* call LoadLibraryA (in kernel32.lib) */ |
| |
| /* check result */ |
| cmp REG_XAX, 0 |
| jne load_dynamo_success |
| pop REG_XBX /* pop off &GetProcAddress */ |
| pop REG_XBX /* pop off &dynamo_entry */ |
| jmp load_dynamo_failure |
| load_dynamo_success: |
| /* TOS has &GetProcAddress */ |
| pop REG_XBX /* pop REG_XBX = &GetProcAddress */ |
| /* dynamo_handle is now in REG_XAX (returned by call LoadLibrary) */ |
| /* TOS has &dynamo_entry */ |
| pop REG_XDI /* for 32-bit we're doing "pop edi, push edi" */ |
| CALLWIN2(REG_XBX, REG_XAX, REG_XDI) /* call GetProcAddress */ |
| cmp REG_XAX, 0 |
| je load_dynamo_failure |
| |
| /* jump to dynamo_auto_start (returned by GetProcAddress) */ |
| jmp REG_XAX |
| /* dynamo_auto_start will take over or continue natively at the saved |
| * context via load_dynamo_failure. |
| */ |
| END_FUNC(load_dynamo) |
| /* N.B.: load_dynamo_failure MUST follow load_dynamo, as both are |
| * copied in one fell swoop by inject_into_thread()! |
| */ |
| /* not really a function but having issues getting both masm and gas to |
| * let other asm routines jump here. |
| * targeted by load_dynamo and dynamo_auto_start by a jump, not a call, |
| * when we should not take over and should go native instead. |
| * Xref case 7654: we come here to the child's copy from dynamo_auto_start |
| * instead of returning to the parent's copy post-load_dynamo to avoid |
| * incompatibilities with stack layout across dr versions. |
| */ |
| DECLARE_FUNC(load_dynamo_failure) |
| GLOBAL_LABEL(load_dynamo_failure:) |
| /* Would be nice if we could free our allocation here as well, but |
| * that's too much of a pain (esp. here). |
| * Note TOS has the saved context at this point, xref layout in |
| * auto_setup. Note this code is duplicated in dynamo_auto_start. */ |
| mov REG_XAX, [MCONTEXT_XSP_OFFS + REG_XSP] /* load app xsp */ |
| mov REG_XBX, [MCONTEXT_PC_OFFS + REG_XSP] /* load app start_pc */ |
| /* write app start_pc off top of app stack */ |
| mov [-ARG_SZ + REG_XAX], REG_XBX |
| /* it's ok to write past app TOS since we're just overwriting part of |
| * the dynamo_entry string which is dead at this point, won't affect |
| * the popping of the saved context */ |
| POPGPR |
| POPF |
| /* we assume reading beyond TOS is ok here (no signals on windows) */ |
| /* we assume xmm0-5 do not need to be restored */ |
| /* restore app xsp (POPGPR doesn't) */ |
| mov REG_XSP, [-MCONTEXT_PC_OFFS + MCONTEXT_XSP_OFFS + REG_XSP] |
| jmp PTRSZ [-ARG_SZ + REG_XSP] /* jmp to app start_pc */ |
| |
| ret |
| END_FUNC(load_dynamo_failure) |
| |
| |
| /***************************************************************************/ |
| #ifndef X64 |
| |
| /* Routines to switch to 64-bit mode from 32-bit WOW64, make a 64-bit |
| * call, and then return to 32-bit mode. |
| */ |
| |
| /* FIXME: check these selector values on all platforms: these are for XPSP2. |
| * Keep in synch w/ defines in arch.h. |
| */ |
| # define CS32_SELECTOR HEX(23) |
| # define CS64_SELECTOR HEX(33) |
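| /* Mode-switch mechanics: the raw ea byte plus a 4-byte offset and a 2-byte |
| * selector below encode a far jmp ptr16:32 whose target selector is |
| * CS64_SELECTOR (0x33), which moves this WOW64 thread into 64-bit mode; |
| * jumping back through CS32_SELECTOR (0x23) returns it to 32-bit mode. The |
| * RAW(4x) prefixes in the 64-bit stretches are REX bytes that the 32-bit |
| * assembler cannot emit itself. |
| */ |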
| |
| /* |
| * int switch_modes_and_load(void *ntdll64_LdrLoadDll, |
| * UNICODE_STRING_64 *lib, |
| * HANDLE *result) |
| */ |
| # define FUNCNAME switch_modes_and_load |
| DECLARE_FUNC(FUNCNAME) |
| GLOBAL_LABEL(FUNCNAME:) |
| /* get args before we change esp */ |
| mov eax, ARG1 |
| mov ecx, ARG2 |
| mov edx, ARG3 |
| /* save callee-saved registers */ |
| push ebx |
| /* far jmp to next instr w/ 64-bit switch: jmp 0033:<sml_transfer_to_64> */ |
| RAW(ea) |
| DD offset sml_transfer_to_64 |
| DB CS64_SELECTOR |
| RAW(00) |
| sml_transfer_to_64: |
| /* Below here is executed in 64-bit mode, but with guarantees that |
| * no address is above 4GB, as this is a WOW64 process. |
| */ |
| /* Call LdrLoadDll to load 64-bit lib: |
| * LdrLoadDll(IN PWSTR DllPath OPTIONAL, |
| * IN PULONG DllCharacteristics OPTIONAL, |
| * IN PUNICODE_STRING DllName, |
| * OUT PVOID *DllHandle)); |
| */ |
| RAW(4c) RAW(8b) RAW(ca) /* mov r9, rdx : 4th arg: result */ |
| RAW(4c) RAW(8b) RAW(c1) /* mov r8, rcx : 3rd arg: lib */ |
| push 0 /* slot for &DllCharacteristics */ |
| lea edx, dword ptr [esp] /* 2nd arg: &DllCharacteristics */ |
| xor ecx, ecx /* 1st arg: DllPath = NULL */ |
| /* save WOW64 state */ |
| RAW(41) push esp /* push r12 */ |
| RAW(41) push ebp /* push r13 */ |
| RAW(41) push esi /* push r14 */ |
| RAW(41) push edi /* push r15 */ |
| /* align the stack pointer */ |
| mov ebx, esp /* save esp in callee-preserved reg */ |
| sub esp, 32 /* call conv */ |
| and esp, HEX(fffffff0) /* align to 16-byte boundary */ |
| call eax |
| mov esp, ebx /* restore esp */ |
| /* restore WOW64 state */ |
| RAW(41) pop edi /* pop r15 */ |
| RAW(41) pop esi /* pop r14 */ |
| RAW(41) pop ebp /* pop r13 */ |
| RAW(41) pop esp /* pop r12 */ |
| /* far jmp to next instr w/ 32-bit switch: jmp 0023:<sml_return_to_32> */ |
| push offset sml_return_to_32 /* 8-byte push */ |
| mov dword ptr [esp + 4], CS32_SELECTOR /* top 4 bytes of prev push */ |
| jmp fword ptr [esp] |
| sml_return_to_32: |
| add esp, 16 /* clean up far jmp target and &DllCharacteristics */ |
| pop ebx /* restore callee-saved reg */ |
| ret /* return value already in eax */ |
| END_FUNC(FUNCNAME) |
| |
| /* |
| * int switch_modes_and_call(void_func_t func, void *arg1, void *arg2, void *arg3) |
| */ |
| # undef FUNCNAME |
| # define FUNCNAME switch_modes_and_call |
| DECLARE_FUNC(FUNCNAME) |
| GLOBAL_LABEL(FUNCNAME:) |
| mov eax, ARG1 |
| mov ecx, ARG2 |
| mov edx, ARG3 |
| /* save callee-saved registers */ |
| push ebx |
| mov ebx, ARG4 |
| /* far jmp to next instr w/ 64-bit switch: jmp 0033:<smc_transfer_to_64> */ |
| RAW(ea) |
| DD offset smc_transfer_to_64 |
| DB CS64_SELECTOR |
| RAW(00) |
| smc_transfer_to_64: |
| /* Below here is executed in 64-bit mode, but with guarantees that |
| * no address is above 4GB, as this is a WOW64 process. |
| */ |
| /* save WOW64 state */ |
| RAW(41) push esp /* push r12 */ |
| RAW(41) push ebp /* push r13 */ |
| RAW(41) push esi /* push r14 */ |
| RAW(41) push edi /* push r15 */ |
| RAW(44) mov eax, ebx /* mov ARG4 in ebx to r8d (3rd arg slot) */ |
| /* align the stack pointer */ |
| mov ebx, esp /* save esp in callee-preserved reg */ |
| sub esp, 32 /* call conv */ |
| and esp, HEX(fffffff0) /* align to 16-byte boundary */ |
| call eax /* arg1 is already in rcx and arg2 in rdx */ |
| mov esp, ebx /* restore esp */ |
| /* restore WOW64 state */ |
| RAW(41) pop edi /* pop r15 */ |
| RAW(41) pop esi /* pop r14 */ |
| RAW(41) pop ebp /* pop r13 */ |
| RAW(41) pop esp /* pop r12 */ |
| /* far jmp to next instr w/ 32-bit switch: jmp 0023:<smc_return_to_32> */ |
| push offset smc_return_to_32 /* 8-byte push */ |
| mov dword ptr [esp + 4], CS32_SELECTOR /* top 4 bytes of prev push */ |
| jmp fword ptr [esp] |
| smc_return_to_32: |
| add esp, 8 /* clean up far jmp target */ |
| pop ebx /* restore callee-saved reg */ |
| ret /* return value already in eax */ |
| END_FUNC(FUNCNAME) |
| |
| /* |
| * DR_API ptr_int_t |
| * dr_invoke_x64_routine(dr_auxlib64_routine_ptr_t func64, uint num_params, ...) |
| */ |
| # undef FUNCNAME |
| # define FUNCNAME dr_invoke_x64_routine |
| DECLARE_EXPORTED_FUNC(FUNCNAME) |
| GLOBAL_LABEL(FUNCNAME:) |
| /* This is 32-bit so we just need the stack ptr to locate all the args */ |
| mov eax, esp |
| /* save callee-saved registers */ |
| push ebx |
| /* far jmp to next instr w/ 64-bit switch: jmp 0033:<inv64_transfer_to_64> */ |
| RAW(ea) |
| DD offset inv64_transfer_to_64 |
| DB CS64_SELECTOR |
| RAW(00) |
| inv64_transfer_to_64: |
| /* Below here is executed in 64-bit mode, but with guarantees that |
| * no address is above 4GB, as this is a WOW64 process. |
| */ |
| /* Save WOW64 state. |
| * FIXME: if the x64 code makes any callbacks, not only do we need |
| * a wrapper to go back to x86 mode but we need to restore these |
| * values in case the x86 callback invokes any syscalls! |
| * Really messy and fragile. |
| */ |
| RAW(41) push esp /* push r12 */ |
| RAW(41) push ebp /* push r13 */ |
| RAW(41) push esi /* push r14 */ |
| RAW(41) push edi /* push r15 */ |
| /* align the stack pointer */ |
| mov ebx, esp /* save esp in callee-preserved reg */ |
| sub esp, 32 /* call conv: 32-byte home space for 4 register args */ |
| mov ecx, dword ptr [12 + eax] /* #args (func64 takes two slots) */ |
| sub ecx, 4 |
| jle inv64_arg_copy_done |
| shl ecx, 3 /* (#args-4)*8 */ |
| sub esp, ecx /* slots for args */ |
| and esp, HEX(fffffff0) /* align to 16-byte boundary */ |
| /* copy the args to their stack slots (simpler to copy the 1st 4 too) */ |
| mov ecx, dword ptr [12 + eax] /* #args */ |
| cmp ecx, 0 |
| je inv64_arg_copy_done |
| inv64_arg_copy_loop: |
| mov edx, dword ptr [12 + 4*ecx + eax] /* ecx = 1-based arg ordinal */ |
| /* FIXME: sign-extension is not always what the user wants. |
| * But the only general way to solve it would be to take in type codes |
| * for each arg! |
| */ |
| RAW(48) RAW(63) RAW(d2) /* movsxd rdx, edx (sign-extend) */ |
| RAW(48) /* qword ptr */ |
| mov dword ptr [-8 + 8*ecx + esp], edx |
| sub ecx, 1 /* we can't use "dec": its short opcode (0x49 for ecx) is a REX prefix in 64-bit mode */ |
| jnz inv64_arg_copy_loop |
| inv64_arg_copy_done: |
| /* put the 1st 4 args into their reg slots */ |
| mov ecx, dword ptr [12 + eax] /* #args */ |
| cmp ecx, 4 |
| jl inv64_arg_lt4 |
| mov edx, dword ptr [12 + 4*4 + eax] /* 1-based arg ordinal */ |
| RAW(4c) RAW(63) RAW(ca) /* movsxd r9, edx */ |
| inv64_arg_lt4: |
| cmp ecx, 3 |
| jl inv64_arg_lt3 |
| mov edx, dword ptr [12 + 4*3 + eax] /* 1-based arg ordinal */ |
| RAW(4c) RAW(63) RAW(c2) /* movsxd r8, edx */ |
| inv64_arg_lt3: |
| cmp ecx, 2 |
| jl inv64_arg_lt2 |
| mov edx, dword ptr [12 + 4*2 + eax] /* 1-based arg ordinal */ |
| RAW(48) RAW(63) RAW(d2) /* movsxd rdx, edx (sign-extend) */ |
| inv64_arg_lt2: |
| cmp ecx, 1 |
| jl inv64_arg_lt1 |
| mov ecx, dword ptr [12 + 4*1 + eax] /* 1-based arg ordinal */ |
| RAW(48) RAW(63) RAW(c9) /* movsxd rcx, ecx (sign-extend) */ |
| inv64_arg_lt1: |
| /* make the call */ |
| RAW(48) /* qword ptr */ |
| mov eax, dword ptr [4 + eax] /* func64 */ |
| RAW(48) call eax |
| /* get top 32 bits of return value into edx for 64-bit x86 return value */ |
| RAW(48) mov edx, eax |
| RAW(48) shr edx, 32 |
| mov esp, ebx /* restore esp */ |
| /* restore WOW64 state */ |
| RAW(41) pop edi /* pop r15 */ |
| RAW(41) pop esi /* pop r14 */ |
| RAW(41) pop ebp /* pop r13 */ |
| RAW(41) pop esp /* pop r12 */ |
| /* far jmp to next instr w/ 32-bit switch: jmp 0023:<inv64_return_to_32> */ |
| push offset inv64_return_to_32 /* 8-byte push */ |
| mov dword ptr [esp + 4], CS32_SELECTOR /* top 4 bytes of prev push */ |
| jmp fword ptr [esp] |
| inv64_return_to_32: |
| add esp, 8 /* clean up far jmp target */ |
| pop ebx /* restore callee-saved reg */ |
| ret /* return value in edx:eax */ |
| END_FUNC(FUNCNAME) |
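| |
| /* Usage sketch (illustrative; "func64" stands for the address of a routine in |
| * a previously loaded 64-bit library, and the two arguments are hypothetical): |
| * |
| * ptr_int_t res = dr_invoke_x64_routine(func64, 2, arg1, arg2); |
| * |
| * Each 32-bit argument is sign-extended to 64 bits (the movsxd encodings |
| * above), args beyond the 4th are copied to stack slots per the x64 call |
| * convention, and the 64-bit result is returned in edx:eax. |
| */ |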
| |
| #endif /* !X64 */ |
| /***************************************************************************/ |
| |
| |
| # ifndef NOT_DYNAMORIO_CORE_PROPER |
| /* void dynamorio_earliest_init_takeover(void) |
| * |
| * Called from hook code for earliest injection. |
| * Since we want to resume at the hooked app code as though nothing |
| * happened, w/o first going back to the hooking code to restore regs, the |
| * caller passed us args pointed at by xax. We then preserve regs and call |
| * C code; the C code takes over, and when it returns to us we restore |
| * regs and return to the app code. |
| * Executes on app stack but we assume app stack is fine at this point. |
| */ |
| DECLARE_EXPORTED_FUNC(dynamorio_earliest_init_takeover) |
| GLOBAL_LABEL(dynamorio_earliest_init_takeover:) |
| PUSHGPR |
| # ifdef EARLIEST_INIT_DEBUGBREAK |
| /* giant loop so we can attach a debugger; then change ebx to 1 |
| * to step through the rest of the code */ |
| mov ebx, HEX(7fffffff) |
| dynamorio_earliest_init_repeat_outer: |
| mov esi, HEX(7fffffff) |
| dynamorio_earliest_init_repeatme: |
| dec esi |
| cmp esi, 0 |
| jg dynamorio_earliest_init_repeatme |
| dec ebx |
| cmp ebx, 0 |
| jg dynamorio_earliest_init_repeat_outer |
| # endif |
| /* args are pointed at by xax */ |
| CALLC1(GLOBAL_REF(dynamorio_earliest_init_takeover_C), REG_XAX) |
| /* we will either be under DR control or running natively at this point */ |
| |
| /* restore */ |
| POPGPR |
| ret |
| END_FUNC(dynamorio_earliest_init_takeover) |
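| |
| /* A minimal sketch of the C entry point invoked above, assuming (based on the |
| * single-xax-argument CALLC1) a signature along these lines; the real |
| * declaration lives in the C sources: |
| * |
| * void dynamorio_earliest_init_takeover_C(byte *arg_ptr); |
| * |
| * where arg_ptr is the argument block the earliest-injection hook left in xax. |
| */ |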
| # endif /* NOT_DYNAMORIO_CORE_PROPER */ |
| |
| #endif /* WINDOWS */ |
| |
| END_FILE |