/* **********************************************************
* Copyright (c) 2011-2014 Google, Inc. All rights reserved.
* Copyright (c) 2000-2010 VMware, Inc. All rights reserved.
* **********************************************************/
/*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of VMware, Inc. nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
/* Copyright (c) 2003-2007 Determina Corp. */
/* Copyright (c) 2001-2003 Massachusetts Institute of Technology */
/* Copyright (c) 2000-2001 Hewlett-Packard Company */
/* file "arch_exports.h" -- arch-specific exported declarations
*
* References:
* "Intel Architecture Software Developer's Manual", 1999.
* "ARM Architecture Manual", 2014.
*/
#ifndef _ARCH_EXPORTS_H_
#define _ARCH_EXPORTS_H_ 1
/* stack slot width */
#define XSP_SZ (sizeof(reg_t))
#ifdef X86
/* PR 264138: we must preserve xmm0-5 if on a 64-bit kernel.
* On Linux we must preserve all xmm registers.
* If AVX is enabled we save ymm.
* i#437: YMM is an extension of XMM from 128-bit to 256-bit without
* adding new ones, so code operating on XMM often also operates on YMM,
* and thus some *XMM* macros also apply to *YMM*.
*/
# define XMM_REG_SIZE 16
# define YMM_REG_SIZE 32
# define XMM_SAVED_REG_SIZE YMM_REG_SIZE /* space in priv_mcontext_t for xmm/ymm */
# define XMM_SLOTS_SIZE (NUM_XMM_SLOTS*XMM_SAVED_REG_SIZE)
# define XMM_SAVED_SIZE (NUM_XMM_SAVED*XMM_SAVED_REG_SIZE)
/* Indicates OS support, not just processor support (xref i#1278) */
# define YMM_ENABLED() (proc_avx_enabled())
# define YMMH_REG_SIZE (YMM_REG_SIZE/2) /* upper half */
# define YMMH_SAVED_SIZE (NUM_XMM_SLOTS*YMMH_REG_SIZE)
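/* Sizing sketch (illustrative; NUM_XMM_SLOTS and NUM_XMM_SAVED are defined
 * elsewhere): each priv_mcontext_t slot reserves XMM_SAVED_REG_SIZE ==
 * YMM_REG_SIZE == 32 bytes so a full ymm value fits even when only xmm is live.
 * For example, assuming NUM_XMM_SAVED == 16:
 *   XMM_SAVED_SIZE  == 16 * 32 == 512 bytes of priv_mcontext_t space
 *   YMMH_SAVED_SIZE == NUM_XMM_SLOTS * 16 bytes for just the upper halves
 */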
#endif /* X86 */
/* Number of slots for spills from inlined clean calls. */
#define CLEANCALL_NUM_INLINE_SLOTS 5
typedef enum {
IBL_NONE = -1,
/* N.B.: order determines which table is on 2nd cache line in local_state_t */
IBL_RETURN = 0, /* returns lookup routine has stricter requirements */
IBL_BRANCH_TYPE_START = IBL_RETURN,
IBL_INDCALL,
IBL_INDJMP,
IBL_GENERIC = IBL_INDJMP, /* currently least restrictive */
/* can double if a generic lookup is needed
FIXME: remove this and add names for specific needs */
IBL_SHARED_SYSCALL = IBL_GENERIC,
IBL_BRANCH_TYPE_END
} ibl_branch_type_t;
#define IBL_HASH_FUNC_OFFSET_MAX IF_X64_ELSE(4,3)
struct _fragment_entry_t; /* in fragment.h */
struct _ibl_table_t; /* in fragment.h */
/* Scratch space and state required to be easily accessible from
* in-cache indirect branch lookup routines, stored in thread-local storage.
* Goal is to get it all in one cache line: currently, though, it is
* 2 lines on 32-byte-line machines, with the call* and jmp* tables and the
* hashtable stats spilling onto the 2nd line.
* Even all on one line, shared ibl has a load vs private ibl's hardcoded immed...
*
* FIXME: to avoid splitting the mcontext for now these scratch
* fs:slots are used in fcache, but copied to the
* mcontext on transitions. see case 3701
*/
typedef struct _lookup_table_access_t {
ptr_uint_t hash_mask;
struct _fragment_entry_t *lookuptable;
} lookup_table_access_t;
typedef struct _table_stat_state_t {
/* Organized in mask-table pairs to get both fields for a particular table
* on the same cache line.
*/
/* FIXME: We can play w/ ordering these fields differently, or, if TLS space is
* crunched, keep only a subset of them in TLS.
* For example, the ret_trace & indcall_trace tables could be heavily used
* but if the indjmp table isn't, it might make sense to put the ret_bb
* table's fields into TLS since ret_bb is likely to be the most heavily
* used for BB2BB IBL.
*/
lookup_table_access_t table[IBL_BRANCH_TYPE_END];
/* FIXME: should allocate this separately so that release and
* DEBUG builds have the same layout especially when backward
* aligned entry */
#ifdef HASHTABLE_STATISTICS
uint stats;
#endif
} table_stat_state_t;
/* FIXME i#1551: implement the spill state for ARM */
/* All spill slots are grouped in a separate struct because with
* -no_ibl_table_in_tls, only these slots are mapped to TLS (and the
* table address/mask pairs are not).
*/
typedef struct _spill_state_t {
/* Four registers are used in the indirect branch lookup routines */
#ifdef X86
reg_t xax, xbx, xcx, xdx; /* general-purpose registers */
#elif defined (ARM)
reg_t r0, r1, r2, r3;
#endif
/* FIXME: move this below the tables to fit more on cache line */
dcontext_t *dcontext;
} spill_state_t;
typedef struct _local_state_t {
spill_state_t spill_space;
} local_state_t;
typedef struct _local_state_extended_t {
spill_state_t spill_space;
table_stat_state_t table_space;
} local_state_extended_t;
/* local_state_[extended_]t is allocated in os-specific thread-local storage (TLS),
* accessible off of fs:. But, the actual segment offset varies, so
* os_tls_offset() must be used to obtain an fs: offset from a slot.
*/
#ifdef X86
# define TLS_XAX_SLOT ((ushort)offsetof(spill_state_t, xax))
# define TLS_XBX_SLOT ((ushort)offsetof(spill_state_t, xbx))
# define TLS_XCX_SLOT ((ushort)offsetof(spill_state_t, xcx))
# define TLS_XDX_SLOT ((ushort)offsetof(spill_state_t, xdx))
# define TLS_SLOT_REG0 TLS_XAX_SLOT
# define TLS_SLOT_REG1 TLS_XBX_SLOT
# define TLS_SLOT_REG2 TLS_XCX_SLOT
# define TLS_SLOT_REG3 TLS_XDX_SLOT
# define SCRATCH_REG0 DR_REG_XAX
# define SCRATCH_REG1 DR_REG_XBX
# define SCRATCH_REG2 DR_REG_XCX
# define SCRATCH_REG3 DR_REG_XDX
#elif defined(ARM)
# define TLS_SLOT_REG0 ((ushort)offsetof(spill_state_t, r0))
# define TLS_SLOT_REG1 ((ushort)offsetof(spill_state_t, r1))
# define TLS_SLOT_REG2 ((ushort)offsetof(spill_state_t, r2))
# define TLS_SLOT_REG3 ((ushort)offsetof(spill_state_t, r3))
# define SCRATCH_REG0 DR_REG_R0
# define SCRATCH_REG1 DR_REG_R1
# define SCRATCH_REG2 DR_REG_R2
# define SCRATCH_REG3 DR_REG_R3
#endif /* X86/ARM */
#define TLS_DCONTEXT_SLOT ((ushort)offsetof(spill_state_t, dcontext))
#define TABLE_OFFSET (offsetof(local_state_extended_t, table_space))
#define TLS_MASK_SLOT(btype) ((ushort)(TABLE_OFFSET \
+ offsetof(table_stat_state_t, table[btype]) \
+ offsetof(lookup_table_access_t, hash_mask)))
#define TLS_TABLE_SLOT(btype) ((ushort)(TABLE_OFFSET \
+ offsetof(table_stat_state_t, table[btype]) \
+ offsetof(lookup_table_access_t, lookuptable)))
#ifdef HASHTABLE_STATISTICS
# define TLS_HTABLE_STATS_SLOT ((ushort)(offsetof(local_state_extended_t, \
table_space) \
+ offsetof(table_stat_state_t, stats)))
#endif
#define TLS_NUM_SLOTS \
(DYNAMO_OPTION(ibl_table_in_tls) ? \
sizeof(local_state_extended_t) / sizeof(void *) : \
sizeof(local_state_t) / sizeof(void *))
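/* Usage sketch (illustrative, not part of the interface): a slot constant is a
 * raw struct offset, not a segment displacement, so it must be translated via
 * os_tls_offset() before being encoded in generated code, e.g.:
 *   ushort slot = TLS_SLOT_REG2;         -- offsetof() within spill_state_t
 *   ushort offs = os_tls_offset(slot);   -- displacement usable off fs:/gs:
 * and only then can a "mov reg -> fs:offs"-style store be emitted.
 */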
#ifdef WINDOWS
# define DETACH_CALLBACK_CODE_SIZE 256
# define DETACH_CALLBACK_FINAL_JMP_SIZE 32
/* For detach - stores callback continuation pcs and is used to dispatch to them after
* we detach. We have one per thread (with stacked callbacks), stored in an array. */
typedef struct _detach_callback_stack_t {
thread_id_t tid; /* thread tid */
ptr_uint_t count; /* number of saved post-syscall continuation pcs */
app_pc *callback_addrs; /* location of array of saved continuation pcs */
reg_t xax_save; /* spill slot for post-syscall code */
reg_t xbx_save; /* spill slot for post-syscall code */
reg_t xcx_save; /* spill slot for post-syscall code */
app_pc target; /* temp slot for post-syscall code */
/* we need some private code to do the actual jmp */
byte code_buf[DETACH_CALLBACK_FINAL_JMP_SIZE];
} detach_callback_stack_t;
void
arch_patch_syscall(dcontext_t *dcontext, byte *target);
byte *
emit_detach_callback_code(dcontext_t *dcontext, byte *buf,
detach_callback_stack_t *callback_state);
void
emit_detach_callback_final_jmp(dcontext_t *dcontext,
detach_callback_stack_t *callback_state);
#endif
/* We use this to ensure that linking and unlinking are atomic with respect
* to a thread in the cache; this is needed for our current flushing
* implementation. Note that linking and unlinking are only atomic with
* respect to a thread in the cache, not with respect to a thread in DynamoRIO
* (which can see linking flags etc.)
*/
/* see bug 524 for additional notes, reproduced here:
* there is no way to do a locked mov; we have to use an xchg or similar, which
* carries a larger performance penalty (not really an issue). Note that xchg
* implies lock, so there is no need for the lock prefix below.
*
* Further, Intel's documentation is a little weird on the issue of
* cross-modifying code (see IA32 volume 3 7-2 through 7-7), "Locked
* instructions should not be used to insure that data written can be fetched
* as instructions" and "Locked operations are atomic with respect to all other
* memory operations and all externally visible events. Only instruction fetch
* and page table access can pass locked instructions", (pass?) however it does
* note that the current versions of P6 family, pentium 4, xeon, pentium and
* 486 allow data written by locked instructions to be fetched as instructions.
* In the cross-modifying code section, however, it gives a (horrible for us)
* algorithm to ensure cross-modifying code is compliant with current and
* future versions of IA-32 then says that "the use of this option is not
* required for programs intended to run on the 486, but is recommended to
* insure compatibility with pentium 4, xeon, P6 family and pentium
* processors", so my take home is that it works now, but don't have any
* expectations for the future - FIXME - */
/* Ref case 3628, case 4397, empirically this only works for code where the
* entire offset being written is within a cache line, so we can't use a locked
* instruction to ensure atomicity */
#define PAD_JMPS_ALIGNMENT \
(INTERNAL_OPTION(pad_jmps_set_alignment) != 0 ? \
INTERNAL_OPTION(pad_jmps_set_alignment) : proc_get_cache_line_size())
#ifdef DEBUG
# define CHECK_JMP_TARGET_ALIGNMENT(target, size, hot_patch) do { \
if (hot_patch && CROSSES_ALIGNMENT(target, size, PAD_JMPS_ALIGNMENT)) { \
STATS_INC(unaligned_patches); \
ASSERT(!DYNAMO_OPTION(pad_jmps)); \
} \
} while (0)
#else
# define CHECK_JMP_TARGET_ALIGNMENT(target, size, hot_patch)
#endif
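/* Hot-patch sketch (illustrative only; assumes the 4-byte offset does not
 * cross a cache line, per the notes above):
 *   cache_pc jmp_pc = ...start of a 5-byte jmp rel32 in the code cache...;
 *   cache_pc target = ...new branch target...;
 *   uint rel32 = (uint)(target - (jmp_pc + JMP_LONG_LENGTH));
 *   ATOMIC_4BYTE_WRITE(jmp_pc + 1, rel32, true);   -- hot_patch == true
 * This is the kind of update that patch_branch() and insert_relative_target()
 * (declared below) perform.
 */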
#ifdef WINDOWS
/* note that the Microsoft compiler will not enregister variables across asm
* blocks that touch those registers, so we don't need to worry about clobbering
* eax and ebx */
# define ATOMIC_4BYTE_WRITE(target, value, hot_patch) do { \
ASSERT(sizeof(value) == 4); \
/* test that we aren't crossing a cache line boundary */ \
CHECK_JMP_TARGET_ALIGNMENT(target, 4, hot_patch); \
/* we use xchgl instead of mov for non-4-byte-aligned writes */ \
_InterlockedExchange((volatile LONG *)target, (LONG)value); \
} while (0)
# ifdef X64
# define ATOMIC_8BYTE_WRITE(target, value, hot_patch) do { \
ASSERT(sizeof(value) == 8); \
/* Not currently used to write code */ \
ASSERT_CURIOSITY(!hot_patch); \
/* test that we aren't crossing a cache line boundary */ \
CHECK_JMP_TARGET_ALIGNMENT(target, 8, hot_patch); \
/* we use xchg instead of mov for non-8-byte-aligned writes */ \
_InterlockedExchange64((volatile __int64 *)target, (__int64)value); \
} while (0)
# endif
/* We use intrinsics since MSVC eliminated inline asm support for x64.
* FIXME: these intrinsics all use xadd even when no return value is needed!
* We assume these aren't performance-critical enough to care.
* If we do change to not have return value, need to change static inlines below.
* Even if these turn into callouts, they should be reentrant.
*/
# define ATOMIC_INC_int(var) _InterlockedIncrement((volatile LONG *)&(var))
# ifdef X64 /* 64-bit intrinsics only avail on x64 */
# define ATOMIC_INC_int64(var) _InterlockedIncrement64((volatile __int64 *)&(var))
# endif
# define ATOMIC_INC(type, var) ATOMIC_INC_##type(var)
# define ATOMIC_DEC_int(var) _InterlockedDecrement((volatile LONG *)&(var))
# ifdef X64 /* 64-bit intrinsics only avail on x64 */
# define ATOMIC_DEC_int64(var) _InterlockedDecrement64((volatile __int64 *)&(var))
# endif
# define ATOMIC_DEC(type, var) ATOMIC_DEC_##type(var)
/* Note that there is no x86/x64 _InterlockedAdd: only for IA64 */
# define ATOMIC_ADD_int(var, value) \
_InterlockedExchangeAdd((volatile LONG *)&(var), value)
# ifdef X64 /* 64-bit intrinsics only avail on x64 */
# define ATOMIC_ADD_int64(var, value) \
_InterlockedExchangeAdd64((volatile __int64 *)&(var), value)
# endif
# define ATOMIC_COMPARE_EXCHANGE_int(var, compare, exchange) \
_InterlockedCompareExchange((volatile LONG *)&(var), exchange, compare)
# define ATOMIC_COMPARE_EXCHANGE_int64(var, compare, exchange) \
_InterlockedCompareExchange64((volatile __int64 *)&(var), exchange, compare)
# define ATOMIC_COMPARE_EXCHANGE ATOMIC_COMPARE_EXCHANGE_int
# define ATOMIC_ADD(type, var, val) ATOMIC_ADD_##type(var, val)
# ifdef X64
# define ATOMIC_ADD_PTR(type, var, val) ATOMIC_ADD_int64(var, val)
# define ATOMIC_COMPARE_EXCHANGE_PTR ATOMIC_COMPARE_EXCHANGE_int64
# else
# define ATOMIC_ADD_PTR(type, var, val) ATOMIC_ADD_int(var, val)
# define ATOMIC_COMPARE_EXCHANGE_PTR ATOMIC_COMPARE_EXCHANGE_int
# endif
# define SPINLOCK_PAUSE() _mm_pause() /* PAUSE = 0xf3 0x90 = repz nop */
# define RDTSC_LL(var) (var = __rdtsc())
# define SERIALIZE_INSTRUCTIONS() do { \
int cpuid_res_local[4]; \
__cpuid(cpuid_res_local, 0); \
} while (0)
/* no intrinsic available, and no inline asm support, so we have x86.asm routine */
byte * get_frame_ptr(void);
byte * get_stack_ptr(void);
# define GET_FRAME_PTR(var) (var = get_frame_ptr())
# define GET_STACK_PTR(var) (var = get_stack_ptr())
/* returns true if result value is zero */
static inline bool atomic_inc_and_test(volatile int *var) {
return (ATOMIC_INC(int, *(var)) == 0);
}
/* returns true if initial value was zero */
static inline bool atomic_dec_and_test(volatile int *var) {
return (ATOMIC_DEC(int, *(var)) == -1);
}
/* returns true if result value is zero */
static inline bool atomic_dec_becomes_zero(volatile int *var) {
return (ATOMIC_DEC(int, *(var)) == 0);
}
/* returns true if var was equal to compare */
static inline bool atomic_compare_exchange_int(volatile int *var,
int compare, int exchange) {
return (ATOMIC_COMPARE_EXCHANGE_int(*(var), compare, exchange) == (compare));
}
static inline bool atomic_compare_exchange_int64(volatile int64 *var,
int64 compare, int64 exchange) {
return ((ptr_int_t)ATOMIC_COMPARE_EXCHANGE_int64(*(var), compare, exchange) ==
(compare));
}
/* atomically adds value to memory location var and returns the sum */
static inline int atomic_add_exchange_int(volatile int *var, int value) {
return ((value) + ATOMIC_ADD(int, *(var), value));
}
# ifdef X64 /* 64-bit intrinsics only avail on x64 */
static inline int64 atomic_add_exchange_int64(volatile int64 *var, int64 value) {
return ((value) + ATOMIC_ADD(int64, *(var), value));
}
# endif
# define atomic_add_exchange atomic_add_exchange_int
#else /* UNIX */
# ifdef X86
/* IA-32 vol 3 7.1.4: processor will internally suppress the bus lock
* if target is within cache line.
*/
# define ATOMIC_4BYTE_WRITE(target, value, hot_patch) do { \
ASSERT(sizeof(value) == 4); \
/* test that we aren't crossing a cache line boundary */ \
CHECK_JMP_TARGET_ALIGNMENT(target, 4, hot_patch); \
/* we use xchgl instead of mov for non-4-byte-aligned writes */ \
__asm__ __volatile__("xchgl (%0), %1" : : "r" (target), "r" (value) : "memory"); \
} while (0)
# ifdef X64
# define ATOMIC_8BYTE_WRITE(target, value, hot_patch) do { \
ASSERT(sizeof(value) == 8); \
/* Not currently used to write code */ \
ASSERT_CURIOSITY(!hot_patch); \
/* test that we aren't crossing a cache line boundary */ \
CHECK_JMP_TARGET_ALIGNMENT(target, 8, hot_patch); \
__asm__ __volatile__("xchgq (%0), %1" : : "r" (target), "r" (value) : "memory"); \
} while (0)
# endif /* X64 */
# define ATOMIC_INC_suffix(suffix, var) \
__asm__ __volatile__("lock inc" suffix " %0" : "=m" (var) : : "memory")
# define ATOMIC_INC_int(var) ATOMIC_INC_suffix("l", var)
# define ATOMIC_INC_int64(var) ATOMIC_INC_suffix("q", var)
# define ATOMIC_DEC_suffix(suffix, var) \
__asm__ __volatile__("lock dec" suffix " %0" : "=m" (var) : : "memory")
# define ATOMIC_DEC_int(var) ATOMIC_DEC_suffix("l", var)
# define ATOMIC_DEC_int64(var) ATOMIC_DEC_suffix("q", var)
/* with just "r" gcc will put $0 from PROBE_WRITE_PC into %eax
* and then complain that "lock addq" can't take %eax!
* so we use "ri":
*/
# define ATOMIC_ADD_suffix(suffix, var, value) \
__asm__ __volatile__("lock add" suffix " %1, %0" \
: "=m" (var) : "ri" (value) : "memory")
# define ATOMIC_ADD_int(var, val) ATOMIC_ADD_suffix("l", var, val)
# define ATOMIC_ADD_int64(var, val) ATOMIC_ADD_suffix("q", var, val)
/* Not safe for general use, just for atomic_add_exchange(), undefed below */
# define ATOMIC_ADD_EXCHANGE_suffix(suffix, var, value, result) \
__asm__ __volatile__("lock xadd" suffix " %1, %0" \
: "=m" (*var), "=r" (result) : "1" (value) : "memory")
# define ATOMIC_ADD_EXCHANGE_int(var, val, res) \
ATOMIC_ADD_EXCHANGE_suffix("l", var, val, res)
# define ATOMIC_ADD_EXCHANGE_int64(var, val, res) \
ATOMIC_ADD_EXCHANGE_suffix("q", var, val, res)
# define ATOMIC_COMPARE_EXCHANGE_suffix(suffix, var, compare, exchange) \
__asm__ __volatile__ ("lock cmpxchg" suffix " %2,%0" \
: "=m" (var) \
: "a" (compare), "r" (exchange) \
: "memory")
# define ATOMIC_COMPARE_EXCHANGE_int(var, compare, exchange) \
ATOMIC_COMPARE_EXCHANGE_suffix("l", var, compare, exchange)
# define ATOMIC_COMPARE_EXCHANGE_int64(var, compare, exchange) \
ATOMIC_COMPARE_EXCHANGE_suffix("q", var, compare, exchange)
# define ATOMIC_EXCHANGE(var, newval, result) \
__asm __volatile ("xchgl %0, %1" \
: "=r" (result), "=m" (var) \
: "0" (newval), "m" (var))
# define SPINLOCK_PAUSE() __asm__ __volatile__("pause")
# define RDTSC_LL(llval) \
__asm__ __volatile__ \
("rdtsc" : "=A" (llval))
# define SERIALIZE_INSTRUCTIONS() \
__asm__ __volatile__ \
("xor %%eax, %%eax; cpuid" : : : "eax", "ebx", "ecx", "edx");
# define GET_FRAME_PTR(var) asm("mov %%"IF_X64_ELSE("rbp","ebp")", %0" : "=m"(var))
# define GET_STACK_PTR(var) asm("mov %%"IF_X64_ELSE("rsp","esp")", %0" : "=m"(var))
# define SET_FLAG(cc, flag) __asm__ __volatile__("set"#cc " %0" :"=qm" (flag) )
# define SET_IF_NOT_ZERO(flag) SET_FLAG(nz, flag)
# define SET_IF_NOT_LESS(flag) SET_FLAG(nl, flag)
# else /* ARM */
# define ATOMIC_4BYTE_WRITE(target, value, hot_patch) do { \
ASSERT(sizeof(value) == 4); \
/* Load and store instructions are atomic on ARM if aligned. */ \
/* FIXME i#1551: we need to patch the whole instruction instead. */ \
ASSERT(ALIGNED(target, 4)); \
__asm__ __volatile__("str %0, [%1]" \
: : "r" (value), "r" (target) \
: "memory"); \
} while (0)
# ifdef X64
# define ATOMIC_8BYTE_WRITE(target, value, hot_patch) do { \
ASSERT(sizeof(value) == 8); \
/* Not currently used to write code */ \
ASSERT_CURIOSITY(!hot_patch); \
/* test that we aren't crossing a cache line boundary */ \
CHECK_JMP_TARGET_ALIGNMENT(target, 8, hot_patch); \
/* Load and store instructions are atomic on ARM if aligned */ \
/* FIXME i#1551: we need to patch the whole instruction instead. */ \
ASSERT(ALIGNED(target, 4)); \
__asm__ __volatile__("strd %0, [%1]" \
: : "r" (value), "r" (target) \
: "memory"); \
} while (0)
# endif /* X64 */
/* OP_swp is deprecated; OP_ldrex and OP_strex were introduced in
* ARMv6 as the ARM synchronization primitives.
*/
/* The manual says "If SCTLR.A and SCTLR.U are both 0,
* a non word-aligned memory address causes UNPREDICTABLE behavior.",
* so we require alignment here.
*/
/* FIXME i#1551: should we allow the infinite loops for these ATOMIC ops? */
# define ATOMIC_INC_suffix(suffix, var) \
__asm__ __volatile__( \
"1: ldrex" suffix " r2, %0 \n\t" \
" add" suffix " r2, r2, #1 \n\t" \
" strex" suffix " r3, r2, %0 \n\t" \
" cmp r3, #0 \n\t" \
" bne 1b" \
: "=Q" (var) /* no offset for ARM mode */ \
: : "cc", "memory", "r2", "r3");
# define ATOMIC_INC_int(var) ATOMIC_INC_suffix("", var)
# define ATOMIC_INC_int64(var) ATOMIC_INC_suffix("d", var)
# define ATOMIC_DEC_suffix(suffix, var) \
__asm__ __volatile__( \
"1: ldrex" suffix " r2, %0 \n\t" \
" sub" suffix " r2, r2, #1 \n\t" \
" strex" suffix " r3, r2, %0 \n\t" \
" cmp r3, #0 \n\t" \
" bne 1b" \
: "=Q" (var) /* no offset for ARM mode */ \
: : "cc", "memory", "r2", "r3");
# define ATOMIC_DEC_int(var) ATOMIC_DEC_suffix("", var)
# define ATOMIC_DEC_int64(var) ATOMIC_DEC_suffix("d", var)
# define ATOMIC_ADD_suffix(suffix, var, value) \
__asm__ __volatile__( \
"1: ldrex" suffix " r2, %0 \n\t" \
" add" suffix " r2, r2, %1 \n\t" \
" strex" suffix " r3, r2, %0 \n\t" \
" cmp r3, #0 \n\t" \
" bne 1b" \
: "=Q" (var) /* no offset for ARM mode */ \
: "r" (value) \
: "cc", "memory", "r2", "r3");
# define ATOMIC_ADD_int(var, val) ATOMIC_ADD_suffix("", var, val)
# define ATOMIC_ADD_int64(var, val) ATOMIC_ADD_suffix("d", var, val)
/* Not safe for general use, just for atomic_add_exchange(), undefed below */
# define ATOMIC_ADD_EXCHANGE_suffix(suffix, var, value, result) \
__asm__ __volatile__( \
"1: ldrex" suffix " r2, %0 \n\t" \
" add" suffix " r2, r2, %2 \n\t" \
" strex" suffix " r3, r2, %0 \n\t" \
" cmp r3, #0 \n\t" \
" bne 1b \n\t" \
" str" suffix " r2, %1" \
: "=Q" (var), "=m" (result) \
: "r" (value) \
: "cc", "memory", "r2", "r3");
# define ATOMIC_ADD_EXCHANGE_int(var, val, res) \
ATOMIC_ADD_EXCHANGE_suffix("", var, val, res)
# define ATOMIC_ADD_EXCHANGE_int64(var, val, res) \
ATOMIC_ADD_EXCHANGE_suffix("d", var, val, res)
# define ATOMIC_COMPARE_EXCHANGE_suffix(suffix, var, compare, exchange) \
__asm__ __volatile__( \
" ldrex" suffix " r2, %0 \n\t" \
" cmp" suffix " r2, %2 \n\t" \
" bne 1f \n\t" \
" strex" suffix " r3, %2, %0 \n\t" \
"1: clrex \n\t" \
: "=Q" (var) /* no offset for ARM mode */ \
: "r" (compare), "r" (exchange) \
: "cc", "memory", "r2", "r3");
# define ATOMIC_COMPARE_EXCHANGE_int(var, compare, exchange) \
ATOMIC_COMPARE_EXCHANGE_suffix("", var, compare, exchange)
# define ATOMIC_COMPARE_EXCHANGE_int64(var, compare, exchange) \
ATOMIC_COMPARE_EXCHANGE_suffix("d", var, compare, exchange)
# define ATOMIC_EXCHANGE(var, newval, result) \
__asm__ __volatile__( \
"1: ldrex r2, %0 \n\t" \
" strex r3, %2, %0 \n\t" \
" cmp r3, #0 \n\t" \
" bne 1b \n\t" \
" str r2, %1" \
: "=Q" (var), "=m" (result) \
: "r" (newval) \
: "cc", "memory", "r2", "r3");
# define SPINLOCK_PAUSE() __asm__ __volatile__("wfi") /* wait for interrupt */
/* FIXME i#1551: there is no RDTSC on ARM. */
# define RDTSC_LL(llval) do { \
ASSERT_NOT_IMPLEMENTED(false); \
(llval) = 0; \
} while (0)
# define SERIALIZE_INSTRUCTIONS() __asm__ __volatile__("clrex");
/* FIXME i#1551: frame pointer is r7 in thumb mode */
# define GET_FRAME_PTR(var) \
__asm__ __volatile__("str "IF_X64_ELSE("x29", "r11")", %0" : "=m"(var))
# define GET_STACK_PTR(var) __asm__ __volatile__("str sp, %0" : "=m"(var))
/* assuming flag is unsigned char */
# define SET_FLAG(cc, flag) \
__asm__ __volatile__( \
" mov r2, #1 \n\t" \
" b"#cc " 1f \n\t" \
" mov r2, #0 \n\t" \
"1: strb r2, %0" \
:"=m" (flag) : : "r2")
# define SET_IF_NOT_ZERO(flag) SET_FLAG(ne, flag)
# define SET_IF_NOT_LESS(flag) SET_FLAG(ge, flag)
# endif /* X86/ARM */
# define ATOMIC_INC(type, var) ATOMIC_INC_##type(var)
# define ATOMIC_DEC(type, var) ATOMIC_DEC_##type(var)
# define ATOMIC_ADD(type, var, val) ATOMIC_ADD_##type(var, val)
# ifdef X64
# define ATOMIC_ADD_PTR(type, var, val) ATOMIC_ADD_int64(var, val)
# else
# define ATOMIC_ADD_PTR(type, var, val) ATOMIC_ADD_int(var, val)
# endif
# define ATOMIC_COMPARE_EXCHANGE ATOMIC_COMPARE_EXCHANGE_int
# ifdef X64
# define ATOMIC_COMPARE_EXCHANGE_PTR ATOMIC_COMPARE_EXCHANGE_int64
# else
# define ATOMIC_COMPARE_EXCHANGE_PTR ATOMIC_COMPARE_EXCHANGE
# endif
/* Atomically increments *var by 1
* Returns true if the resulting value is zero, otherwise returns false
*/
static inline bool atomic_inc_and_test(volatile int *var)
{
unsigned char c;
ATOMIC_INC(int, *var);
/* flags should be set according to resulting value, now we convert that back to C */
SET_IF_NOT_ZERO(c);
/* FIXME: we add an extra memory reference to a local,
although we could put the return value in EAX ourselves */
return c == 0;
}
/* Atomically decrements *var by 1
* Returns true if the initial value was zero, otherwise returns false
*/
static inline bool atomic_dec_and_test(volatile int *var)
{
unsigned char c;
ATOMIC_DEC(int, *var);
/* result should be set according to value before change, now we convert that back to C */
SET_IF_NOT_LESS(c);
/* FIXME: we add an extra memory reference to a local,
although we could put the return value in EAX ourselves */
return c == 0;
}
/* Atomically decrements *var by 1
* Returns true if the resulting value is zero, otherwise returns false
*/
static inline bool atomic_dec_becomes_zero(volatile int *var)
{
unsigned char c;
ATOMIC_DEC(int, *var);
/* result should be set according to value after change, now we convert that back to C */
SET_IF_NOT_ZERO(c);
/* FIXME: we add an extra memory reference to a local,
although we could put the return value in EAX ourselves */
return c == 0;
}
/* returns true if var was equal to compare, and now is equal to exchange,
otherwise returns false
*/
static inline bool atomic_compare_exchange_int(volatile int *var,
int compare, int exchange)
{
unsigned char c;
ATOMIC_COMPARE_EXCHANGE(*var, compare, exchange);
/* ZF is set if matched, all other flags are as if a normal compare happened */
/* we convert ZF value back to C */
SET_IF_NOT_ZERO(c);
/* FIXME: we add an extra memory reference to a local,
although we could put the return value in EAX ourselves */
return c == 0;
}
/* exchanges *var with newval and returns original *var */
static inline int
atomic_exchange_int(volatile int *var, int newval)
{
int result;
ATOMIC_EXCHANGE(*var, newval, result);
return result;
}
#ifdef X64
/* returns true if var was equal to compare, and now is equal to exchange,
otherwise returns false
*/
static inline bool atomic_compare_exchange_int64(volatile int64 *var,
int64 compare,
int64 exchange)
{
unsigned char c;
ATOMIC_COMPARE_EXCHANGE_int64(*var, compare, exchange);
/* ZF is set if matched, all other flags are as if a normal compare happened */
/* we convert ZF value back to C */
SET_IF_NOT_ZERO(c);
/* FIXME: we add an extra memory reference to a local,
although we could put the return value in EAX ourselves */
return c == 0;
}
#endif
/* atomically adds value to memory location var and returns the sum */
static inline int atomic_add_exchange_int(volatile int *var, int value)
{
int temp;
ATOMIC_ADD_EXCHANGE_int(var, value, temp);
return (temp + value);
}
static inline int64 atomic_add_exchange_int64(volatile int64 *var, int64 value)
{
int64 temp;
ATOMIC_ADD_EXCHANGE_int64(var, value, temp);
return (temp + value);
}
# define atomic_add_exchange atomic_add_exchange_int
# undef ATOMIC_ADD_EXCHANGE_suffix
# undef ATOMIC_ADD_EXCHANGE_int
# undef ATOMIC_ADD_EXCHANGE_int64
#endif /* UNIX */
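/* Usage sketch (illustrative; my_refcount is hypothetical): the atomic helpers
 * above are typically used for reference counting, e.g.:
 *   static volatile int my_refcount;
 *   ...
 *   ATOMIC_INC(int, my_refcount);                -- take a reference
 *   ...
 *   if (atomic_dec_becomes_zero(&my_refcount))   -- drop it; true when last
 *       ...free the shared resource...
 */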
#define atomic_compare_exchange atomic_compare_exchange_int
#ifdef X64
# define atomic_compare_exchange_ptr(v, c, e) \
atomic_compare_exchange_int64((volatile int64 *)(v), (int64)(c), (int64)(e))
# define ATOMIC_ADDR_WRITE ATOMIC_8BYTE_WRITE
#else
# define atomic_compare_exchange_ptr(v, c, e) \
atomic_compare_exchange_int((volatile int *)(v), (int)(c), (int)(e))
# define ATOMIC_ADDR_WRITE ATOMIC_4BYTE_WRITE
#endif
#define ATOMIC_MAX_int(type, maxvar, curvar) do { \
type atomic_max__maxval; \
type atomic_max__curval = (curvar); \
ASSERT(sizeof(int) == sizeof(maxvar)); \
ASSERT(sizeof(type) == sizeof(maxvar)); \
ASSERT(sizeof(type) == sizeof(curvar)); \
do { \
atomic_max__maxval = (maxvar); \
} while (atomic_max__maxval < atomic_max__curval && \
!atomic_compare_exchange_int((int*)&(maxvar), \
atomic_max__maxval, \
atomic_max__curval)); \
} while (0)
#ifdef X64 /* 64-bit intrinsics only avail on x64 */
# define ATOMIC_MAX_int64(type, maxvar, curvar) do { \
type atomic_max__maxval; \
type atomic_max__curval = (curvar); \
ASSERT(sizeof(int64) == sizeof(maxvar)); \
ASSERT(sizeof(type) == sizeof(maxvar)); \
ASSERT(sizeof(type) == sizeof(curvar)); \
do { \
atomic_max__maxval = (maxvar); \
} while (atomic_max__maxval < atomic_max__curval && \
!atomic_compare_exchange_int64((int64*)&(maxvar), \
atomic_max__maxval, \
atomic_max__curval)); \
} while (0)
#endif
#define ATOMIC_MAX(type, maxvar, curvar) ATOMIC_MAX_##type(type, maxvar, curvar)
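/* Usage sketch (illustrative; peak_live_threads is hypothetical): ATOMIC_MAX
 * loops on a compare-exchange until the stored maximum is at least curvar:
 *   static int peak_live_threads;
 *   ...
 *   int live = ...current thread count...;
 *   ATOMIC_MAX(int, peak_live_threads, live);    -- lock-free max update
 */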
#define DEBUGGER_INTERRUPT_BYTE 0xcc
/* if hot_patch is true:
* The write that inserts the relative target is done atomically so this
* function is safe with respect to a thread executing the code containing
* this target, presuming that the code in both the before and after states
* is valid
*/
byte *
insert_relative_target(byte *pc, cache_pc target, bool hot_patch);
byte *
insert_relative_jump(byte *pc, cache_pc target, bool hot_patch);
/* in arch.c */
#ifdef PROFILE_RDTSC
#ifdef UNIX
/* This only works on Pentium I or later */
__inline__ uint64 get_time();
#else /* WINDOWS */
/* This only works on Pentium I or later */
uint64 get_time(void);
#endif
#endif
void arch_init(void);
void arch_exit(IF_WINDOWS_ELSE_NP(bool detach_stacked_callbacks, void));
void arch_thread_init(dcontext_t *dcontext);
void arch_thread_exit(dcontext_t *dcontext _IF_WINDOWS(bool detach_stacked_callbacks));
#if defined(WINDOWS_PC_SAMPLE) && !defined(DEBUG)
/* for sampling fast exit path */
void arch_thread_profile_exit(dcontext_t *dcontext);
void arch_profile_exit(void);
#endif
byte *
code_align_forward(byte *pc, size_t alignment);
bool is_indirect_branch_lookup_routine(dcontext_t *dcontext, cache_pc pc);
bool in_generated_routine(dcontext_t *dcontext, cache_pc pc);
bool in_context_switch_code(dcontext_t *dcontext, cache_pc pc);
bool in_indirect_branch_lookup_code(dcontext_t *dcontext, cache_pc pc);
cache_pc get_fcache_target(dcontext_t *dcontext);
void set_fcache_target(dcontext_t *dcontext, cache_pc value);
void copy_mcontext(priv_mcontext_t *src, priv_mcontext_t *dst);
bool dr_mcontext_to_priv_mcontext(priv_mcontext_t *dst, dr_mcontext_t *src);
bool priv_mcontext_to_dr_mcontext(dr_mcontext_t *dst, priv_mcontext_t *src);
priv_mcontext_t *dr_mcontext_as_priv_mcontext(dr_mcontext_t *mc);
priv_mcontext_t *get_priv_mcontext_from_dstack(dcontext_t *dcontext);
void dr_mcontext_init(dr_mcontext_t *mc);
void dump_mcontext(priv_mcontext_t *context, file_t f, bool dump_xml);
const char *get_branch_type_name(ibl_branch_type_t branch_type);
ibl_branch_type_t get_ibl_branch_type(instr_t *instr);
/* Return the entry point for a routine with which an
* atomic hashtable delete can be performed for the given fragment.
*/
cache_pc
get_target_delete_entry_pc(dcontext_t *dcontext,
struct _ibl_table_t *table);
cache_pc get_reset_exit_stub(dcontext_t *dcontext);
typedef linkstub_t * (* fcache_enter_func_t) (dcontext_t *dcontext);
fcache_enter_func_t get_fcache_enter_private_routine(dcontext_t *dcontext);
cache_pc get_unlinked_entry(dcontext_t *dcontext, cache_pc linked_entry);
cache_pc get_linked_entry(dcontext_t *dcontext, cache_pc unlinked_entry);
#ifdef X64
cache_pc get_trace_cmp_entry(dcontext_t *dcontext, cache_pc linked_entry);
#endif
cache_pc get_do_syscall_entry(dcontext_t *dcontext);
#ifdef WINDOWS
fcache_enter_func_t get_fcache_enter_indirect_routine(dcontext_t *dcontext);
cache_pc get_do_callback_return_entry(dcontext_t *dcontext);
#else
cache_pc get_do_int_syscall_entry(dcontext_t *dcontext);
cache_pc get_do_int81_syscall_entry(dcontext_t *dcontext);
cache_pc get_do_int82_syscall_entry(dcontext_t *dcontext);
cache_pc get_do_clone_syscall_entry(dcontext_t *dcontext);
# ifdef VMX86_SERVER
cache_pc get_do_vmkuw_syscall_entry(dcontext_t *dcontext);
# endif
#endif
byte * get_global_do_syscall_entry(void);
/* NOTE - because of the Sygate int 2e hack, after_do_syscall_addr and
* after_shared_syscall_addr in fact both check for the same address for
* int system calls, so they can't be used to disambiguate between the two. */
#ifdef WINDOWS
/* For int system calls
* - addr is the instruction immediately after the int 2e
* - code is the addr of our code that handles post int 2e
* For non-int system calls addr and code will be the same (ret addr); this
* is just for working around our Sygate int 2e in ntdll hack (5217) */
cache_pc after_shared_syscall_addr(dcontext_t *dcontext);
cache_pc after_shared_syscall_code(dcontext_t *dcontext);
bool is_shared_syscall_routine(dcontext_t *dcontext, cache_pc pc);
/* For int system calls
* - addr is the instruction immediately after the int 2e
* - code is the addr of our code that handles post int 2e
* For non-int system calls addr and code will be the same (ret addr); this
* is just for working around our Sygate int 2e in ntdll hack (5217) */
cache_pc after_do_syscall_addr(dcontext_t *dcontext);
cache_pc after_do_syscall_code(dcontext_t *dcontext);
#else
cache_pc after_do_shared_syscall_addr(dcontext_t *dcontext);
cache_pc after_do_syscall_addr(dcontext_t *dcontext);
bool is_after_main_do_syscall_addr(dcontext_t *dcontext, cache_pc pc);
bool is_after_do_syscall_addr(dcontext_t *dcontext, cache_pc pc);
#endif
bool is_after_syscall_address(dcontext_t *dcontext, cache_pc pc);
bool is_after_syscall_that_rets(dcontext_t *dcontext, cache_pc pc);
void update_generated_hashtable_access(dcontext_t *dcontext);
fcache_enter_func_t get_fcache_enter_shared_routine(dcontext_t *dcontext);
/* Method of performing system call.
* We assume that only one method is in use, except for 32-bit applications
* on 64-bit x86 linux kernels, which use both sys{enter,call} on the vsyscall
* page and inlined int (PR 286922).
* For these apps, DR itself and global_do_syscall use int, but we
* have both a do_syscall for the vsyscall and a separate do_int_syscall
* (we can't use the vsyscall for some system calls like clone: we could
* potentially use int for everything if we fixed up the syscall args).
* The method set in that case is the vsyscall method.
*/
enum {
SYSCALL_METHOD_UNINITIALIZED,
SYSCALL_METHOD_INT,
SYSCALL_METHOD_SYSENTER,
SYSCALL_METHOD_SYSCALL,
#ifdef WINDOWS
SYSCALL_METHOD_WOW64,
#endif
SYSCALL_METHOD_SVC, /* ARM */
};
#ifdef UNIX
enum { SYSCALL_METHOD_LONGEST_INSTR = 2 }; /* to ensure safe patching */
#endif
void check_syscall_method(dcontext_t *dcontext, instr_t *instr);
int get_syscall_method(void);
/* Does the syscall instruction always return to the invocation point? */
bool does_syscall_ret_to_callsite(void);
void set_syscall_method(int method);
#ifdef LINUX
bool should_syscall_method_be_sysenter(void);
#endif
/* returns the address of the first app syscall instruction we saw (see hack
* in win32/os.c that uses this for PRE_SYSCALL_PC); not for general use */
byte *get_app_sysenter_addr(void);
/* in [x86/arm].asm */
/* Calls the specified function 'func' after switching to the stack 'stack'. If we're
* currently on the initstack, 'mutex_to_free' should be passed so we release the
* initstack lock. The supplied 'dcontext' will be passed as an argument to 'func'.
* If 'func' returns then 'return_on_return' is checked. If set, we swap back stacks and
* return to the caller. If not set, it's assumed that func wasn't supposed to
* return and we go to the error routine unexpected_return() below.
*/
void call_switch_stack(dcontext_t *dcontext, byte *stack, void (*func) (dcontext_t *),
void *mutex_to_free, bool return_on_return);
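/* Usage sketch (illustrative; my_cleanup_func and new_stack are hypothetical):
 *   byte *new_stack = ...top of a stack we own, e.g. this thread's DR stack...;
 *   call_switch_stack(dcontext, new_stack, my_cleanup_func,
 *                     NULL,    -- not currently on the initstack
 *                     true);   -- swap back and return here when func returns
 */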
# if defined (WINDOWS) && !defined(X64)
DYNAMORIO_EXPORT int64
dr_invoke_x64_routine(dr_auxlib64_routine_ptr_t func64, uint num_params, ...);
# endif
void unexpected_return(void);
void clone_and_swap_stack(byte *stack, byte *tos);
void go_native(dcontext_t *dcontext);
void cleanup_and_terminate(dcontext_t *dcontext, int sysnum,
ptr_uint_t sys_arg1, ptr_uint_t sys_arg2, bool exitproc,
/* these 2 args are only used for Mac thread exit */
ptr_uint_t sys_arg3, ptr_uint_t sys_arg4);
bool cpuid_supported(void);
void our_cpuid(int res[4], int eax);
#ifdef WINDOWS
int dynamorio_syscall_int2e(int sysnum, ...);
int dynamorio_syscall_sysenter(int sysnum, ...);
int dynamorio_syscall_sygate_int2e(int sysnum, ...);
int dynamorio_syscall_sygate_sysenter(int sysnum, ...);
# ifdef X64
int dynamorio_syscall_syscall(int sysnum, ...);
# endif
int dynamorio_syscall_wow64(int sysnum, ...);
/* Use this version if !syscall_uses_edx_param_base() */
int dynamorio_syscall_wow64_noedx(int sysnum, ...);
void get_segments_cs_ss(cxt_seg_t *cs, cxt_seg_t *ss);
void get_segments_defg(cxt_seg_t *ds, cxt_seg_t *es, cxt_seg_t *fs, cxt_seg_t *gs);
void get_own_context_helper(CONTEXT *cxt);
# ifdef STACK_GUARD_PAGE
/* PR203701: If the dstack is exhausted we'll use this function to
* call internal_exception_info() with a separate exception stack.
*/
void call_intr_excpt_alt_stack(dcontext_t *dcontext, EXCEPTION_RECORD *pExcptRec,
CONTEXT *cxt, byte *stack);
# endif
void dynamorio_earliest_init_takeover(void);
#else /* UNIX */
void client_int_syscall(void);
# ifdef MACOS
/* Some 32-bit syscalls return 64-bit values (e.g., SYS_lseek) in eax:edx */
int64 dynamorio_syscall(uint sysnum, uint num_args, ...);
int64 dynamorio_mach_dep_syscall(uint sysnum, uint num_args, ...);
ptr_int_t dynamorio_mach_syscall(uint sysnum, uint num_args, ...);
# else
ptr_int_t dynamorio_syscall(uint sysnum, uint num_args, ...);
# endif
void dynamorio_sigreturn(void);
void dynamorio_sys_exit(void);
# ifdef MACOS
void dynamorio_semaphore_signal_all(KSYNCH_TYPE *ksynch/*in xax*/);
# endif
# ifdef LINUX
void dynamorio_futex_wake_and_exit(volatile int *futex/* in xax*/);
# ifndef X64
void dynamorio_nonrt_sigreturn(void);
# endif
# endif
# ifdef LINUX
thread_id_t dynamorio_clone(uint flags, byte *newsp, void *ptid, void *tls,
void *ctid, void (*func)(void));
# endif
# ifdef MACOS
void new_bsdthread_intercept(void);
# endif
#endif
void back_from_native(void);
/* These two are labels, not functions. */
void back_from_native_retstubs(void);
void back_from_native_retstubs_end(void);
/* Each stub should be 4 bytes: push imm8 + jmp rel8 */
enum { BACK_FROM_NATIVE_RETSTUB_SIZE = 4 };
#ifdef UNIX
void native_plt_call(void);
#endif
DEBUG_DECLARE(void debug_infinite_loop(void); /* handy cpu eating infinite loop */)
void hashlookup_null_handler(void);
void dr_stmxcsr(uint *val);
void dr_xgetbv(uint *high, uint *low);
void dr_fxsave(byte *buf_aligned);
void dr_fnsave(byte *buf_aligned);
void dr_fxrstor(byte *buf_aligned);
void dr_frstor(byte *buf_aligned);
#ifdef X64
void dr_fxsave32(byte *buf_aligned);
void dr_fxrstor32(byte *buf_aligned);
#endif
void dr_fpu_exception_init(void);
/* Keep in synch with x86.asm. This is the difference between the SP saved in
* the mcontext and the SP of the caller of dr_app_start() and
* dynamorio_app_take_over().
*/
#define DYNAMO_START_XSP_ADJUST 16
/* x86_code.c */
void dynamo_start(priv_mcontext_t *mc);
/* Gets the retstack index saved in x86.asm and restores the mcontext to the
* original app state.
*/
int
native_get_retstack_idx(priv_mcontext_t *mc);
/* in proc.c -- everything in proc.h is exported so just include it here */
#include "proc.h"
/* in disassemble.c */
#ifdef DEBUG /* because uses logfile */
void disassemble_fragment(dcontext_t *dcontext, fragment_t *f, bool just_header);
void disassemble_app_bb(dcontext_t *dcontext, app_pc tag, file_t outfile);
/* dumps callstack for ebp stored in mcontext */
void dump_mcontext_callstack(dcontext_t *dcontext);
#endif
/* flags for dump_callstack_to_buffer */
enum {
CALLSTACK_USE_XML = 0x00000001,
CALLSTACK_ADD_HEADER = 0x00000002,
CALLSTACK_MODULE_INFO = 0x00000004,
CALLSTACK_MODULE_PATH = 0x00000008,
CALLSTACK_FRAME_PTR = 0x00000010,
};
/* dumps callstack for current pc and ebp */
void
dump_dr_callstack(file_t outfile);
/* user-specified ebp */
void
dump_callstack(app_pc pc, app_pc ebp, file_t outfile, bool dump_xml);
void
dump_callstack_to_buffer(char *buf, size_t bufsz, size_t *sofar,
app_pc pc, app_pc ebp, uint flags /*CALLSTACK_ bitmask*/);
#if defined(INTERNAL) || defined(DEBUG) || defined(CLIENT_INTERFACE)
void disassemble_fragment_header(dcontext_t *dcontext, fragment_t *f, file_t outfile);
void disassemble_fragment_body(dcontext_t *dcontext, fragment_t *f, file_t outfile);
void disassemble_app_bb(dcontext_t *dcontext, app_pc tag, file_t outfile);
#endif /* INTERNAL || DEBUG || CLIENT_INTERFACE */
/* in emit_utils.c */
static inline bool
use_addr_prefix_on_short_disp(void)
{
#ifdef STANDALONE_DECODER
/* Not worth providing control over this. Go w/ most likely best choice. */
return false;
#else
/* -ibl_addr_prefix => addr prefix everywhere */
return (DYNAMO_OPTION(ibl_addr_prefix) ||
/* PR 212807, PR 209709: addr prefix is noticeably worse
* on Pentium M, Core, and Core2.
* It's better on Pentium 4 and Pentium D.
*
* Note that this variation by processor type does not need to
* be stored in pcaches b/c either way works and the size is
* not assumed (except for prefixes: but coarse_units doesn't
* support prefixes in general).
*/
/* P4 and PD */
(proc_get_family() == FAMILY_PENTIUM_4 ||
/* PPro, P2, P3, but not PM */
(proc_get_family() == FAMILY_PENTIUM_3 &&
(proc_get_model() <= 8 ||
proc_get_model() == 10 ||
proc_get_model() == 11))));
/* FIXME: should similarly remove addr prefixes from hardcoded
* emits in emit_utils.c, except in cases where space is more
* important than speed.
* FIXME: case 5231 long term solution should properly choose
* - ibl - speed
* - prefixes - speed/space?
* - app code - preserved since we normally don't need to reencode,
* unless it is a CTI that goes through FS - should be preserved too
* - direct stubs - space
* - indirect stubs - speed/space?
* - enter/exit - speed?
* - interception routines - speed?
*/
#endif /* STANDALONE_DECODER */
}
/* Merge w/ _LENGTH enum below? */
/* not ifdef X64 to simplify code */
#define SIZE64_MOV_XAX_TO_TLS 8
#define SIZE64_MOV_XBX_TO_TLS 9
#define SIZE64_MOV_PTR_IMM_TO_XAX 10
#define SIZE64_MOV_PTR_IMM_TO_TLS (12*2) /* high and low 32 bits separately */
#define SIZE64_MOV_R8_TO_XAX 3
#define SIZE64_MOV_R9_TO_XCX 3
#define SIZE32_MOV_XAX_TO_TLS 5
#define SIZE32_MOV_XBX_TO_TLS 6
#define SIZE32_MOV_XAX_TO_TLS_DISP32 6
#define SIZE32_MOV_XBX_TO_TLS_DISP32 7
#define SIZE32_MOV_XAX_TO_ABS 5
#define SIZE32_MOV_XBX_TO_ABS 6
#define SIZE32_MOV_PTR_IMM_TO_XAX 5
#define SIZE32_MOV_PTR_IMM_TO_TLS 10
#ifdef X64
# define FRAG_IS_32(flags) (TEST(FRAG_32_BIT, (flags)))
# define FRAG_IS_X86_TO_X64(flags) (TEST(FRAG_X86_TO_X64, (flags)))
#else
# define FRAG_IS_32(flags) true
# define FRAG_IS_X86_TO_X64(flags) false
#endif
#define SIZE_MOV_XAX_TO_TLS(flags, require_addr16) \
(FRAG_IS_32(flags) ? \
((require_addr16 || use_addr_prefix_on_short_disp()) ? \
SIZE32_MOV_XAX_TO_TLS : SIZE32_MOV_XAX_TO_TLS_DISP32) : \
SIZE64_MOV_XAX_TO_TLS)
#define SIZE_MOV_XBX_TO_TLS(flags, require_addr16) \
(FRAG_IS_32(flags) ? \
((require_addr16 || use_addr_prefix_on_short_disp()) ? \
SIZE32_MOV_XBX_TO_TLS : SIZE32_MOV_XBX_TO_TLS_DISP32) : \
SIZE64_MOV_XBX_TO_TLS)
#define SIZE_MOV_PTR_IMM_TO_XAX(flags) \
(FRAG_IS_32(flags) ? SIZE32_MOV_PTR_IMM_TO_XAX : SIZE64_MOV_PTR_IMM_TO_XAX)
/* size of restore ecx prefix */
#define XCX_IN_TLS(flags) (DYNAMO_OPTION(private_ib_in_tls) || TEST(FRAG_SHARED, (flags)))
#define FRAGMENT_BASE_PREFIX_SIZE(flags) \
((FRAG_IS_X86_TO_X64(flags) && \
IF_X64_ELSE(DYNAMO_OPTION(x86_to_x64_ibl_opt), false)) ? \
SIZE64_MOV_R9_TO_XCX : \
(XCX_IN_TLS(flags) ? SIZE_MOV_XBX_TO_TLS(flags, false) : SIZE32_MOV_XBX_TO_ABS))
/* exported for DYNAMO_OPTION(separate_private_stubs)
* FIXME: find better way to export -- would use global var accessed
* by macro, but easiest to have as static initializer for heap bucket
*/
/* for -thread_private, we're relying on the fact that
* SIZE32_MOV_XAX_TO_TLS == SIZE32_MOV_XAX_TO_ABS, and that
* x64 always uses tls
*/
#define DIRECT_EXIT_STUB_SIZE32 \
(SIZE32_MOV_XAX_TO_TLS + SIZE32_MOV_PTR_IMM_TO_XAX + JMP_LONG_LENGTH)
#define DIRECT_EXIT_STUB_SIZE64 \
(SIZE64_MOV_XAX_TO_TLS + SIZE64_MOV_PTR_IMM_TO_XAX + JMP_LONG_LENGTH)
#define DIRECT_EXIT_STUB_SIZE(flags) \
(FRAG_IS_32(flags) ? DIRECT_EXIT_STUB_SIZE32 : DIRECT_EXIT_STUB_SIZE64)
/* coarse-grain stubs use a store directly to memory so they can
* link through the stub and not mess up app state.
* 1st instr example:
* 67 64 c7 06 e0 0e 02 99 4e 7d addr16 mov $0x7d4e9902 -> %fs:0x0ee0
* 64-bit is split into high and low dwords:
* 65 c7 04 25 20 16 00 00 02 99 4e 7d mov $0x7d4e9902 -> %gs:0x1620
* 65 c7 04 25 24 16 00 00 00 00 00 00 mov $0x00000000 -> %gs:0x1624
* both of these exact sequences are assumed in entrance_stub_target_tag()
* and coarse_indirect_stub_jmp_target().
*/
#define STUB_COARSE_DIRECT_SIZE32 (SIZE32_MOV_PTR_IMM_TO_TLS + JMP_LONG_LENGTH)
#define STUB_COARSE_DIRECT_SIZE64 (SIZE64_MOV_PTR_IMM_TO_TLS + JMP_LONG_LENGTH)
#define STUB_COARSE_DIRECT_SIZE(flags) \
(FRAG_IS_32(flags) ? STUB_COARSE_DIRECT_SIZE32 : STUB_COARSE_DIRECT_SIZE64)
/* writes nops into the address range */
#define SET_TO_NOPS(addr, size) memset(addr, 0x90, size)
/* writes debugbreaks into the address range */
#define SET_TO_DEBUG(addr, size) memset(addr, 0xcc, size)
/* check if region is SET_TO_NOP */
#define IS_SET_TO_NOP(addr, size) is_region_memset_to_char(addr, size, 0x90)
/* check if region is SET_TO_DEBUG */
#define IS_SET_TO_DEBUG(addr, size) is_region_memset_to_char(addr, size, 0xcc)
/* offset of the patchable region from the end of a cti */
#define CTI_PATCH_OFFSET 4
/* size of the patch to a cti */
#define CTI_PATCH_SIZE 4
/* offset of the patchable region from the end of a stub */
#define EXIT_STUB_PATCH_OFFSET 4
/* size of the patch to a stub */
#define EXIT_STUB_PATCH_SIZE 4
/* the most bytes we'll need to shift a patchable location for -pad_jmps */
#define MAX_PAD_SIZE 3
/* evaluates to true if region crosses at most 1 padding boundary */
#define WITHIN_PAD_REGION(lower, upper) \
((upper)-(lower) <= PAD_JMPS_ALIGNMENT)
#define STATS_PAD_JMPS_ADD(flags, stat, val) DOSTATS({ \
if (TEST(FRAG_SHARED, (flags))) { \
if (TEST(FRAG_IS_TRACE, (flags))) \
STATS_ADD(pad_jmps_shared_trace_##stat, val); \
else \
STATS_ADD(pad_jmps_shared_bb_##stat, val); \
} \
else if (TEST(FRAG_IS_TRACE, (flags))) \
STATS_ADD(pad_jmps_trace_##stat, val); \
else if (TEST(FRAG_TEMP_PRIVATE, (flags))) \
STATS_ADD(pad_jmps_temp_##stat, val); \
else \
STATS_ADD(pad_jmps_bb_##stat, val); \
})
bool
is_exit_cti_stub_patchable(dcontext_t *dcontext, instr_t *inst,
uint frag_flags);
uint
extend_trace_pad_bytes(fragment_t *add_frag);
uint
patchable_exit_cti_align_offs(dcontext_t *dcontext, instr_t *inst, cache_pc pc);
bool
is_patchable_exit_stub(dcontext_t *dcontext, linkstub_t *l, fragment_t *f);
uint
bytes_for_exitstub_alignment(dcontext_t *dcontext, linkstub_t *l, fragment_t *f,
byte *startpc);
byte *
pad_for_exitstub_alignment(dcontext_t *dcontext, linkstub_t *l, fragment_t *f,
byte *startpc);
void
remove_nops_from_ilist(dcontext_t *dcontext, instrlist_t *ilist
_IF_DEBUG(bool recreating));
uint
nop_pad_ilist(dcontext_t *dcontext, fragment_t *f, instrlist_t *ilist, bool emitting);
bool
is_exit_cti_patchable(dcontext_t *dcontext, instr_t *inst, uint frag_flags);
int exit_stub_size(dcontext_t *dcontext, cache_pc target, uint flags);
int insert_exit_stub(dcontext_t *dcontext, fragment_t *f,
linkstub_t *l, cache_pc stub_pc);
int insert_exit_stub_other_flags(dcontext_t *dcontext, fragment_t *f, linkstub_t *l,
cache_pc stub_pc, ushort l_flags);
int
linkstub_unlink_entry_offset(dcontext_t *dcontext, fragment_t *f, linkstub_t *l);
cache_pc
indirect_linkstub_stub_pc(dcontext_t *dcontext, fragment_t *f, linkstub_t *l);
cache_pc
indirect_linkstub_target(dcontext_t *dcontext, fragment_t *f, linkstub_t *l);
/* based on machine state, returns which of l1 and l2 must have been taken */
linkstub_t *
linkstub_cbr_disambiguate(dcontext_t *dcontext, fragment_t *f,
linkstub_t *l1, linkstub_t *l2);
cache_pc
cbr_fallthrough_exit_cti(cache_pc prev_cti_pc);
/* for use with patch_branch and insert_relative_target */
enum {
NOT_HOT_PATCHABLE=false,
HOT_PATCHABLE=true
};
void patch_branch(cache_pc branch_pc, cache_pc target_pc, bool hot_patch);
bool link_direct_exit(dcontext_t *dcontext, fragment_t *f, linkstub_t *l,
fragment_t *targetf, bool hot_patch);
void unlink_direct_exit(dcontext_t *dcontext, fragment_t *f, linkstub_t *l);
void link_indirect_exit(dcontext_t *dcontext, fragment_t *f, linkstub_t *l,
bool hot_patch);
void unlink_indirect_exit(dcontext_t *dcontext, fragment_t *f, linkstub_t *l);
void insert_fragment_prefix(dcontext_t *dcontext, fragment_t *f);
int fragment_prefix_size(uint flags);
void update_indirect_exit_stub(dcontext_t *dcontext, fragment_t *f, linkstub_t *l);
#ifdef PROFILE_RDTSC
uint profile_call_size(void);
void insert_profile_call(cache_pc start_pc);
void finalize_profile_call(dcontext_t *dcontext, fragment_t *f);
#endif
int decode_syscall_num(dcontext_t *dcontext, byte *entry);
#ifdef WINDOWS
void link_shared_syscall(dcontext_t *dcontext);
void unlink_shared_syscall(dcontext_t *dcontext);
#endif
/* Coarse-grain fragment support */
cache_pc
entrance_stub_jmp(cache_pc stub);
cache_pc
entrance_stub_jmp_target(cache_pc stub);
app_pc
entrance_stub_target_tag(cache_pc stub, coarse_info_t *info);
bool
coarse_is_indirect_stub(cache_pc stub);
bool
coarse_cti_is_intra_fragment(dcontext_t *dcontext, coarse_info_t *info,
instr_t *inst, cache_pc start_pc);
cache_pc
coarse_indirect_stub_jmp_target(cache_pc stub);
uint
coarse_indirect_stub_size(coarse_info_t *info);
bool
coarse_is_entrance_stub(cache_pc stub);
bool
coarse_is_trace_head(cache_pc stub);
bool
entrance_stub_linked(cache_pc stub, coarse_info_t *info /*OPTIONAL*/);
void
link_entrance_stub(dcontext_t *dcontext, cache_pc stub, cache_pc tgt,
bool hot_patch, coarse_info_t *info /*OPTIONAL*/);
void
unlink_entrance_stub(dcontext_t *dcontext, cache_pc stub, uint flags,
coarse_info_t *info /*OPTIONAL*/);
cache_pc
entrance_stub_from_cti(cache_pc cti);
uint
coarse_exit_prefix_size(coarse_info_t *info);
byte *
emit_coarse_exit_prefix(dcontext_t *dcontext, byte *pc, coarse_info_t *info);
/* Update info pointer in exit prefixes */
void
patch_coarse_exit_prefix(dcontext_t *dcontext, coarse_info_t *info);
bool
special_ibl_xfer_is_thread_private(void);
void
link_special_ibl_xfer(dcontext_t *dcontext);
void
unlink_special_ibl_xfer(dcontext_t *dcontext);
#ifdef CLIENT_INTERFACE
cache_pc
get_client_ibl_xfer_entry(dcontext_t *dcontext);
#endif
#ifdef UNIX
cache_pc
get_native_plt_ibl_xfer_entry(dcontext_t *dcontext);
cache_pc
get_native_ret_ibl_xfer_entry(dcontext_t *dcontext);
#endif
enum {
MAX_INSTR_LENGTH = 17,
/* size of 32-bit-offset jcc instr, assuming it has no
* jcc branch hint!
*/
CBR_LONG_LENGTH = 6,
JMP_LONG_LENGTH = 5,
JMP_SHORT_LENGTH = 2,
CBR_SHORT_REWRITE_LENGTH = 9, /* FIXME: use this in mangle.c */
RET_0_LENGTH = 1,
PUSH_IMM32_LENGTH = 5,
/* size of 32-bit call and jmp instructions w/o prefixes. */
CTI_IND1_LENGTH = 2, /* FF D6 call esi */
CTI_IND2_LENGTH = 3, /* FF 14 9E call dword ptr [esi+ebx*4] */
CTI_IND3_LENGTH = 4, /* FF 54 B3 08 call dword ptr [ebx+esi*4+8] */
CTI_DIRECT_LENGTH = 5, /* E8 9A 0E 00 00 call 7C8024CB */
CTI_IAT_LENGTH = 6, /* FF 15 38 10 80 7C call dword ptr ds:[7C801038h] */
CTI_FAR_ABS_LENGTH = 7, /* 9A 1B 07 00 34 39 call 0739:3400071B */
/* 07 */
INT_LENGTH = 2,
SYSCALL_LENGTH = 2,
SYSENTER_LENGTH = 2,
SVC_THUMB_LENGTH = 2, /* Thumb syscall instr */
SVC_ARM_LENGTH = 4, /* ARM syscall instr */
};
#define REL32_REACHABLE_OFFS(offs) ((offs) <= INT_MAX && (offs) >= INT_MIN)
/* source should be the end of a rip-relative-referencing instr */
#define REL32_REACHABLE(source, target) REL32_REACHABLE_OFFS((target) - (source))
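/* Usage sketch (illustrative): before emitting a 5-byte jmp rel32 at pc, check
 * reachability from the end of the instruction:
 *   if (REL32_REACHABLE(pc + JMP_LONG_LENGTH, target))
 *       ...emit jmp rel32...
 *   else
 *       ...fall back to an indirect branch...
 */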
/* If code_buf points to a jmp rel32 returns true and returns the target of
* the jmp in jmp_target as if it were located at app_loc. */
bool
is_jmp_rel32(byte *code_buf, app_pc app_loc, app_pc *jmp_target /* OUT */);
/* If code_buf points to a jmp rel8 returns true and returns the target of
* the jmp in jmp_target as if it were located at app_loc. */
bool
is_jmp_rel8(byte *code_buf, app_pc app_loc, app_pc *jmp_target /* OUT */);
/* in interp.c */
/* An upper bound on instructions added to a bb when added to a trace,
* which is of course highest for the case of indirect branch mangling.
* A normal lea, jecxz, lea sequence is 14; NATIVE_RETURN (now removed) could get above
* 20, but this should cover everything. It is fine to be well above: this is
* only used to keep below the maximum trace size for the next bb;
* we calculate the exact size in fixup_last_cti().
*
* For x64 we have to increase this (PR 333576 hit this):
* +19 L4 65 48 89 0c 25 10 00 mov %rcx -> %gs:0x10
* 00 00
* +28 L4 48 8b c8 mov %rax -> %rcx
* +31 L4 e9 1b e2 f6 ff jmp $0x00000000406536e0 <shared_bb_ibl_indjmp>
* (+36)
* =>
* +120 L0 65 48 89 0c 25 10 00 mov %rcx -> %gs:0x10
* 00 00
* +129 L0 48 8b c8 mov %rax -> %rcx
* +132 L3 65 48 a3 00 00 00 00 mov %rax -> %gs:0x00
* 00 00 00 00
* +143 L3 48 b8 23 24 93 28 00 mov $0x0000000028932423 -> %rax
* 00 00 00
* +153 L3 65 48 a3 08 00 00 00 mov %rax -> %gs:0x08
* 00 00 00 00
* +164 L3 9f lahf -> %ah
* +165 L3 0f 90 c0 seto -> %al
* +168 L3 65 48 3b 0c 25 08 00 cmp %rcx %gs:0x08
* 00 00
* +177 L4 0f 85 a9 d7 f6 ff jnz $0x000000004065312f <shared_trace_cmp_indjmp>
* +183 L3 65 48 8b 0c 25 10 00 mov %gs:0x10 -> %rcx
* 00 00
* +192 L3 04 7f add $0x7f %al -> %al
* +194 L3 9e sahf %ah
* +195 L3 65 48 a1 00 00 00 00 mov %gs:0x00 -> %rax
* 00 00 00 00
* +206
*
* (36-19)=17 vs (206-120)=86 => 69 bytes. was 65 bytes prior to PR 209709!
* usually 3 bytes smaller since we don't need to restore eflags.
*/
#define TRACE_CTI_MANGLE_SIZE_UPPER_BOUND 72
fragment_t *
build_basic_block_fragment(dcontext_t *dcontext, app_pc start_pc,
uint initial_flags, bool linked, bool visible
_IF_CLIENT(bool for_trace)
_IF_CLIENT(instrlist_t **unmangled_ilist));
void interp(dcontext_t *dcontext);
uint extend_trace(dcontext_t *dcontext, fragment_t *f, linkstub_t *prev_l);
int append_trace_speculate_last_ibl(dcontext_t *dcontext, instrlist_t *trace,
app_pc speculate_next_tag, bool record_translation);
uint
forward_eflags_analysis(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr);
/* Converts instr_t EFLAGS_ flags to corresponding fragment_t FRAG_ flags,
* assuming that the instr_t flags correspond to the start of the fragment_t
*/
uint
instr_eflags_to_fragment_eflags(uint instr_eflags);
instrlist_t *
decode_fragment(dcontext_t *dcontext, fragment_t *f, byte *buf, /*OUT*/uint *bufsz,
uint target_flags, /*OUT*/uint *dir_exits, /*OUT*/uint *indir_exits);
instrlist_t *
decode_fragment_exact(dcontext_t *dcontext, fragment_t *f, byte *buf,
/*IN/OUT*/uint *bufsz, uint target_flags,
/*OUT*/uint *dir_exits, /*OUT*/uint *indir_exits);
fragment_t * copy_fragment(dcontext_t *dcontext, fragment_t *f, bool replace);
void shift_ctis_in_fragment(dcontext_t *dcontext, fragment_t *f, ssize_t shift,
cache_pc start, cache_pc end, size_t old_size);
#ifdef PROFILE_RDTSC
void add_profile_call(dcontext_t *dcontext);
#endif
app_pc emulate(dcontext_t *dcontext, app_pc pc, priv_mcontext_t *mc);
bool instr_is_trace_cmp(dcontext_t *dcontext, instr_t *inst);
typedef struct {
app_pc region_start;
app_pc region_end;
app_pc start_pc;
app_pc min_pc;
app_pc max_pc;
app_pc bb_end;
bool contiguous;
bool overlap;
} overlap_info_t;
instrlist_t * build_app_bb_ilist(dcontext_t *dcontext, byte *start_pc, file_t outf);
void
bb_build_abort(dcontext_t *dcontext, bool clean_vmarea, bool unlock);
bool
expand_should_set_translation(dcontext_t *dcontext);
/* Builds an instrlist_t as though building a bb from pc.
* Use recreate_fragment_ilist() for building an instrlist_t for a fragment.
* If check_vm_area is false, this does NOT call check_thread_vm_area()!
* Make sure you know it will terminate at the right spot. It does
* check selfmod and native_exec for elision, but otherwise will
* follow ubrs to the limit. Currently used for
* record_translation_info() (case 3559).
*/
instrlist_t * recreate_bb_ilist(dcontext_t *dcontext, byte *pc, byte *pretend_pc,
app_pc stop_pc/*optional, only for full_decode*/,
uint flags,
uint *res_flags, uint *res_exit_type,
bool check_vm_area, bool mangle, void **vmlist
_IF_CLIENT(bool call_client)
_IF_CLIENT(bool for_trace));
instrlist_t *
recreate_fragment_ilist(dcontext_t *dcontext, byte *pc,
/*IN/OUT*/fragment_t **f_res, /*OUT*/bool *alloc,
bool mangle _IF_CLIENT(bool call_client));
app_pc find_app_bb_end(dcontext_t *dcontext, byte *start_pc, uint flags);
bool app_bb_overlaps(dcontext_t *dcontext, byte *start_pc, uint flags,
byte *region_start, byte *region_end, overlap_info_t *info_res);
bool
reached_image_entry_yet(void);
void
set_reached_image_entry(void);
/* in decode.c, needed here for ref in arch.h */
/* DR_API EXPORT TOFILE dr_ir_utils.h */
/* DR_API EXPORT BEGIN */
/** Specifies which processor mode to use when decoding or encoding. */
typedef enum _dr_isa_mode_t {
DR_ISA_IA32, /**< IA-32 (Intel/AMD 32-bit mode). */
    DR_ISA_X86 = DR_ISA_IA32, /**< Alias for DR_ISA_IA32. */
DR_ISA_AMD64, /**< AMD64 (Intel/AMD 64-bit mode). */
DR_ISA_ARM_THUMB, /**< Thumb (ARM T16 and T32). */
DR_ISA_ARM_A32, /**< ARM A32 (AArch32 ARM). */
DR_ISA_ARM_A64, /**< ARM A64 (AArch64). */
} dr_isa_mode_t;
/* DR_API EXPORT END */
#define DEFAULT_ISA_MODE \
IF_X86_ELSE(IF_X64_ELSE(DR_ISA_AMD64, DR_ISA_IA32), \
IF_X64_ELSE(DR_ISA_ARM_A64, DR_ISA_ARM_A32))
DR_API
/**
* The decode and encode routines use a per-thread persistent flag that
* indicates which processor mode to use. This routine sets that flag to the
 * indicated value and optionally returns the old value. Be sure to restore the
 * old value prior to any further application execution to avoid
 * misinterpreting application code.
*/
bool
dr_set_isa_mode(dcontext_t *dcontext, dr_isa_mode_t new_mode,
dr_isa_mode_t *old_mode OUT);
DR_API
/**
* The decode and encode routines use a per-thread persistent flag that
* indicates which processor mode to use. This routine returns the value of
* that flag.
*/
dr_isa_mode_t
dr_get_isa_mode(dcontext_t *dcontext);
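/* A minimal usage sketch (hypothetical caller code, not part of this header):
 * temporarily switch the decode/encode mode, e.g. to handle 32-bit code inside
 * a 64-bit x86 process, and restore it before the app runs again.
 *
 *   dr_isa_mode_t old_mode;
 *   if (dr_set_isa_mode(dcontext, DR_ISA_IA32, &old_mode)) {
 *       // ... decode or encode 32-bit instructions here ...
 *       dr_set_isa_mode(dcontext, old_mode, NULL);
 *       ASSERT(dr_get_isa_mode(dcontext) == old_mode);
 *   }
 */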
/* in encode.c */
/* DR_API EXPORT TOFILE dr_ir_instr.h */
DR_API
/** Returns true iff \p instr can be encoded as a valid IA-32 instruction. */
bool
instr_is_encoding_possible(instr_t *instr);
DR_API
/**
* Encodes \p instr into the memory at \p pc.
* Uses the x86/x64 mode stored in instr, not the mode of the current thread.
* Returns the pc after the encoded instr, or NULL if the encoding failed.
* If instr is a cti with an instr_t target, the note fields of instr and
* of the target must be set with the respective offsets of each instr_t!
* (instrlist_encode does this automatically, if the target is in the list).
* x86 instructions can occupy up to 17 bytes, so the caller should ensure
* the target location has enough room to avoid overflow.
*/
byte *
instr_encode(dcontext_t *dcontext, instr_t *instr, byte *pc);
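/* Encoding sketch (hypothetical; assumes "instr" was created or decoded
 * elsewhere using this thread's dcontext):
 *
 *   byte buf[17];   // max x86 instruction length, per the note above
 *   byte *next = instr_encode(dcontext, instr, buf);
 *   ASSERT(next != NULL);
 *   // next - buf is the encoded length; buf holds the instruction bytes
 */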
DR_API
/**
* Encodes \p instr into the memory at \p copy_pc in preparation for copying
* to \p final_pc. Any pc-relative component is encoded as though the
* instruction were located at \p final_pc. This allows for direct copying
* of the encoded bytes to \p final_pc without re-relativization.
*
* Uses the x86/x64 mode stored in instr, not the mode of the current thread.
* Returns the pc after the encoded instr, or NULL if the encoding failed.
* If instr is a cti with an instr_t target, the note fields of instr and
* of the target must be set with the respective offsets of each instr_t!
* (instrlist_encode does this automatically, if the target is in the list).
* x86 instructions can occupy up to 17 bytes, so the caller should ensure
* the target location has enough room to avoid overflow.
*/
byte *
instr_encode_to_copy(dcontext_t *dcontext, instr_t *instr, byte *copy_pc,
byte *final_pc);
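/* Relocation sketch (hypothetical; "final_pc" is where the bytes will
 * ultimately live, e.g. a code cache slot):
 *
 *   byte scratch[17];
 *   byte *end = instr_encode_to_copy(dcontext, instr, scratch, final_pc);
 *   ASSERT(end != NULL);
 *   memcpy(final_pc, scratch, end - scratch);   // already relativized for final_pc
 */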
/* DR_API EXPORT TOFILE dr_ir_instrlist.h */
DR_API
/**
* Encodes each instruction in \p ilist in turn in contiguous memory starting
* at \p pc. Returns the pc after all of the encodings, or NULL if any one
* of the encodings failed.
* Uses the x86/x64 mode stored in each instr, not the mode of the current thread.
* In order for instr_t operands to be encoded properly,
* \p has_instr_jmp_targets must be true. If \p has_instr_jmp_targets is true,
* the note field of each instr_t in ilist will be overwritten, and if any
* instr_t targets are not in \p ilist, they must have their note fields set with
* their offsets relative to pc.
* x86 instructions can occupy up to 17 bytes each, so the caller should ensure
* the target location has enough room to avoid overflow.
*/
byte *
instrlist_encode(dcontext_t *dcontext, instrlist_t *ilist, byte *pc,
bool has_instr_jmp_targets);
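/* List-encoding sketch (hypothetical: "ilist" was built elsewhere and "buf"
 * is sized for the worst case of 17 bytes per x86 instruction):
 *
 *   byte *end = instrlist_encode(dcontext, ilist, buf,
 *                                true);  // has_instr_jmp_targets
 *   ASSERT(end != NULL);
 */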
DR_API
/**
* Encodes each instruction in \p ilist in turn in contiguous memory
 * starting at \p copy_pc in preparation for copying to \p final_pc. Any
* pc-relative instruction is encoded as though the instruction list were
* located at \p final_pc. This allows for direct copying of the
* encoded bytes to \p final_pc without re-relativization.
*
* Returns the pc after all of the encodings, or NULL if any one
* of the encodings failed.
*
* Uses the x86/x64 mode stored in each instr, not the mode of the current thread.
*
* In order for instr_t operands to be encoded properly,
* \p has_instr_jmp_targets must be true. If \p has_instr_jmp_targets is true,
* the note field of each instr_t in ilist will be overwritten, and if any
* instr_t targets are not in \p ilist, they must have their note fields set with
* their offsets relative to pc.
*
* If \p max_pc is non-NULL, computes the total size required to encode the
* instruction list before performing any encoding. If the whole list will not
* fit starting at \p copy_pc without exceeding \p max_pc, returns NULL without
* encoding anything. Otherwise encodes as normal. Note that x86 instructions
* can occupy up to 17 bytes each, so if \p max_pc is NULL, the caller should
* ensure the target location has enough room to avoid overflow.
*/
byte *
instrlist_encode_to_copy(dcontext_t *dcontext, instrlist_t *ilist, byte *copy_pc,
byte *final_pc, byte *max_pc, bool has_instr_jmp_targets);
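/* Bounded-encoding sketch (hypothetical; "slot" and "slot_size" describe a
 * pre-reserved code cache region and "target_pc" is its final address):
 *
 *   byte *end = instrlist_encode_to_copy(dcontext, ilist, slot, target_pc,
 *                                        slot + slot_size, true);
 *   if (end == NULL) {
 *       // either an instr failed to encode or the list would exceed slot_size;
 *       // in the latter case nothing was written to slot
 *   }
 */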
/* in mangle.c */
void insert_clean_call_with_arg_jmp_if_ret_true(dcontext_t *dcontext, instrlist_t *ilist,
instr_t *instr, void *callee, int arg, app_pc jmp_tag, instr_t *jmp_instr);
#ifdef UNIX
void mangle_clone_code(dcontext_t *dcontext, byte *pc, bool skip);
bool mangle_syscall_code(dcontext_t *dcontext, fragment_t *f, byte *pc, bool skip);
#endif
void finalize_selfmod_sandbox(dcontext_t *dcontext, fragment_t *f);
bool instr_check_xsp_mangling(dcontext_t *dcontext, instr_t *inst, int *xsp_adjust);
void
float_pc_update(dcontext_t *dcontext);
/* in retcheck.c */
#ifdef CHECK_RETURNS_SSE2
void finalize_return_check(dcontext_t *dcontext, fragment_t *f);
#endif
#ifdef RETURN_AFTER_CALL
void add_return_target(dcontext_t *dcontext, app_pc instr_pc, instr_t *instr);
int ret_after_call_check(dcontext_t *dcontext, app_pc target_addr, app_pc src_addr);
bool is_observed_call_site(dcontext_t *dcontext, app_pc retaddr);
#endif
/* in optimize.c */
void optimize_trace(dcontext_t *dcontext, app_pc tag, instrlist_t *trace);
#ifdef DEBUG
void print_optimization_stats(void);
#endif
#ifdef SIDELINE
/* exact overlap with sideline.h */
#include "sideline.h"
#endif
#include "../link.h"
/* convert link flags to ibl_branch_type_t */
static inline ibl_branch_type_t
extract_branchtype(ushort linkstub_flags)
{
if (TEST(LINK_RETURN, linkstub_flags))
return IBL_RETURN;
if (EXIT_IS_CALL(linkstub_flags))
return IBL_INDCALL;
if (TEST(LINK_JMP, linkstub_flags)) /* plain JMP or IND_JMP_PLT */
return IBL_INDJMP;
ASSERT_NOT_REACHED();
return IBL_GENERIC;
}
/* convert ibl_branch_type_t to LINK_ flags */
static inline uint
ibltype_to_linktype(ibl_branch_type_t ibltype)
{
if (ibltype == IBL_RETURN)
return LINK_INDIRECT | LINK_RETURN;
if (ibltype == IBL_INDCALL)
return LINK_INDIRECT | LINK_CALL;
if (ibltype == IBL_INDJMP)
return LINK_INDIRECT | LINK_JMP;
ASSERT_NOT_REACHED();
return 0;
}
#ifdef DEBUG
bool is_ibl_routine_type(dcontext_t *dcontext, cache_pc target, ibl_branch_type_t branch_type);
#endif /* DEBUG */
/* This completely optimizable routine is the only place where we
 * allow a data pointer to be converted to a function pointer, which
 * enables better type-checking for the rest of our C code.
 */
/* on x86 function pointers and data pointers are interchangeable */
static inline
generic_func_t
convert_data_to_function(void *data_ptr)
{
#ifdef WINDOWS
# pragma warning(push)
# pragma warning(disable : 4055)
#endif
return (generic_func_t)data_ptr;
#ifdef WINDOWS
# pragma warning(pop)
#endif
}
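/* Usage sketch (hypothetical; "addr" is a void* known to point at a routine
 * taking no arguments, e.g. an address found in a module's export table):
 *
 *   generic_func_t func = convert_data_to_function(addr);
 *   (*func)();
 */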
/* Our version of setjmp & longjmp. Currently used only for handling hot patch
* exceptions and to implement an internal generic try-catch mechanism
* later on (case 1800).
 * We could use a priv_mcontext_t here, but that has 4 extra fields that aren't
 * used here. TODO: should we use it anyway and live with the wasted space,
 * especially in light of the merging from PR 218131?
 *
 * eax & edx need not be saved: they are scratch (caller-save) registers in a
 * call and are used to return values from functions, and since longjmp is
 * implemented as a return from setjmp, their values need not survive it.
*/
typedef struct dr_jmp_buf_t {
reg_t xbx;
reg_t xcx;
reg_t xdi;
reg_t xsi;
reg_t xbp;
reg_t xsp;
reg_t xip;
#ifdef X64
    /* optimization: r8-r11 are caller-saved (scratch) regs:
     * can we rely on that and not save them?
     */
reg_t r8, r9, r10, r11, r12, r13, r14, r15;
#endif
#if defined(UNIX) && defined(DEBUG)
/* i#226/PR 492568: we avoid the cost of storing this by using the
* mask in the fault's signal frame, but we do record it in debug
* build to verify our assumptions
*/
kernel_sigset_t sigmask;
#endif
} dr_jmp_buf_t;
/* in x86.asm */
int
dr_longjmp(dr_jmp_buf_t *buf, int val);
int
dr_setjmp(dr_jmp_buf_t *buf);
/* Fast asm-based safe read, but requires fault handling to be set up */
bool safe_read_fast(const void *base, size_t size, void *out_buf, size_t *bytes_read);
/* For os-specific fault recovery */
bool is_safe_read_pc(app_pc pc);
app_pc safe_read_resume_pc(void);
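/* Guarded-read sketch (hypothetical; "app_addr" is an untrusted application
 * address and fault handling has already been set up as required above):
 *
 *   reg_t value;
 *   size_t got;
 *   if (safe_read_fast(app_addr, sizeof(value), &value, &got) &&
 *       got == sizeof(value)) {
 *       // value holds the word read from app_addr; otherwise the read faulted
 *   }
 */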
#ifdef UNIX
/* i#46: Private string routines for libc isolation. */
# ifdef memcpy
# undef memcpy
# endif
# ifdef memset
# undef memset
# endif
void *memcpy(void *dst, const void *src, size_t n);
void *memset(void *dst, int val, size_t n);
#endif /* UNIX */
#ifdef UNIX
/* Private replacement for _dl_runtime_resolve() for native_exec. */
void *_dynamorio_runtime_resolve(void);
#endif
#define DR_SETJMP(buf) (dr_setjmp(buf))
#define DR_LONGJMP(buf, val) \
do { \
        ASSERT(val != 0); \
dr_longjmp(buf, val); \
} while (0)
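/* Typical guard pattern (sketch only; the fault-recovery path is hypothetical):
 *
 *   dr_jmp_buf_t buf;
 *   if (DR_SETJMP(&buf) == 0) {
 *       // first return from dr_setjmp: run the code that may fault
 *   } else {
 *       // reached via DR_LONGJMP(&buf, nonzero) from a recovery/fault handler
 *   }
 */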
/* Macros to access application function parameters.
 * These assume that we're at function entry (i.e., mc->xsp points at the
 * return address on X86, and mc->sp points at the first on-stack arg on ARM).
 * Compare the SYS_PARAM* macros and the REGPARM* enum: there is some duplication.
 * See the usage sketch after these definitions.
 *
 * Note that on X64, if a param is 32 bits we must ignore the top 32 bits
 * of its stack slot: since it is passed via "mov dword" instead of "push",
 * the top bits are garbage.
 */
#ifdef X86
# ifdef X64
# ifdef WINDOWS
# define APP_PARAM_0(mc) (mc)->xcx
# define APP_PARAM_1(mc) (mc)->xdx
# define APP_PARAM_2(mc) (mc)->r8
# define APP_PARAM_3(mc) (mc)->r9
# define APP_PARAM_4(mc) (*(((reg_t *)((mc)->xsp)) + 5))
# define APP_PARAM_5(mc) (*(((reg_t *)((mc)->xsp)) + 6))
# define APP_PARAM_6(mc) (*(((reg_t *)((mc)->xsp)) + 7))
# define APP_PARAM_7(mc) (*(((reg_t *)((mc)->xsp)) + 8))
# define APP_PARAM_8(mc) (*(((reg_t *)((mc)->xsp)) + 9))
# define APP_PARAM_9(mc) (*(((reg_t *)((mc)->xsp)) + 10))
# define APP_PARAM_10(mc) (*(((reg_t *)((mc)->xsp)) + 11))
# else
# define APP_PARAM_0(mc) (mc)->xdi
# define APP_PARAM_1(mc) (mc)->xsi
# define APP_PARAM_2(mc) (mc)->rdx
# define APP_PARAM_3(mc) (mc)->rcx
# define APP_PARAM_4(mc) (mc)->r8
# define APP_PARAM_5(mc) (mc)->r9
# define APP_PARAM_6(mc) (*(((reg_t *)((mc)->xsp)) + 1))
# define APP_PARAM_7(mc) (*(((reg_t *)((mc)->xsp)) + 2))
# define APP_PARAM_8(mc) (*(((reg_t *)((mc)->xsp)) + 3))
# define APP_PARAM_9(mc) (*(((reg_t *)((mc)->xsp)) + 4))
# define APP_PARAM_10(mc) (*(((reg_t *)((mc)->xsp)) + 5))
# endif /* Win/Unix */
/* only takes integer literals */
# define APP_PARAM(mc, offs) APP_PARAM_##offs(mc)
# else /* 32-bit */
/* only takes integer literals */
# define APP_PARAM(mc, offs) (*(((reg_t *)((mc)->xsp)) + (offs) + 1))
# endif /* 64/32-bit */
#elif defined(ARM)
# ifdef UNIX
# define APP_PARAM_0(mc) (mc)->r0
# define APP_PARAM_1(mc) (mc)->r1
# define APP_PARAM_2(mc) (mc)->r2
# define APP_PARAM_3(mc) (mc)->r3
# ifdef X64
# define APP_PARAM_4(mc) (mc)->r4
# define APP_PARAM_5(mc) (mc)->r5
# define APP_PARAM_6(mc) (mc)->r6
# define APP_PARAM_7(mc) (mc)->r7
# define APP_PARAM_8(mc) (*(((reg_t *)((mc)->xsp)) + 0))
# define APP_PARAM_9(mc) (*(((reg_t *)((mc)->xsp)) + 1))
# define APP_PARAM_10(mc) (*(((reg_t *)((mc)->xsp)) + 2))
# else
# define APP_PARAM_4(mc) (*(((reg_t *)((mc)->xsp)) + 0))
# define APP_PARAM_5(mc) (*(((reg_t *)((mc)->xsp)) + 1))
# define APP_PARAM_6(mc) (*(((reg_t *)((mc)->xsp)) + 2))
# define APP_PARAM_7(mc) (*(((reg_t *)((mc)->xsp)) + 3))
# define APP_PARAM_8(mc) (*(((reg_t *)((mc)->xsp)) + 4))
# define APP_PARAM_9(mc) (*(((reg_t *)((mc)->xsp)) + 5))
# define APP_PARAM_10(mc) (*(((reg_t *)((mc)->xsp)) + 6))
# endif /* 64/32-bit */
# else /* Windows */
# error Windows is not supported
# endif /* UNIX/Win */
# define APP_PARAM(mc, offs) APP_PARAM_##offs(mc)
#endif /* X86/ARM */
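/* Usage sketch for APP_PARAM (hypothetical; "mc" points at the priv_mcontext_t
 * captured at the application function's entry point, per the assumptions in
 * the comment above):
 *
 *   reg_t arg0 = APP_PARAM(mc, 0);
 *   reg_t arg2 = APP_PARAM(mc, 2);   // offs must be an integer literal
 */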
#define MCXT_SYSNUM_REG(mc) ((mc)->IF_X86_ELSE(xax, r7))
#define MCXT_FIRST_REG_FIELD(mc) ((mc)->IF_X86_ELSE(xdi, r0))
static inline
reg_t
get_mcontext_frame_ptr(dcontext_t *dcontext, priv_mcontext_t *mc)
{
reg_t reg;
switch (dr_get_isa_mode(dcontext)) {
#ifdef X86
case DR_ISA_IA32:
case DR_ISA_AMD64:
reg = mc->xbp;
break;
#elif defined(ARM)
# ifdef X64
case DR_ISA_ARM_A64:
reg = mc->r29;
break;
# else
case DR_ISA_ARM_THUMB:
reg = mc->r7;
break;
case DR_ISA_ARM_A32:
reg = mc->r11;
break;
# endif /* 64/32-bit */
#endif /* X86/ARM */
default:
ASSERT_NOT_REACHED();
reg = 0;
}
return reg;
}
/* FIXME: check on all platforms: these are for Fedora 8 and XP SP2.
 * Keep in synch w/ the defines in x86.asm.
 */
#define CS32_SELECTOR 0x23
#define CS64_SELECTOR 0x33
#endif /* _ARCH_EXPORTS_H_ */