blob: 4679cd09f8c13c00ce3bb96a2211790516b7ba3b [file] [log] [blame]
/* **********************************************************
* Copyright (c) 2010-2014 Google, Inc. All rights reserved.
* Copyright (c) 2000-2010 VMware, Inc. All rights reserved.
* **********************************************************/
/*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of VMware, Inc. nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
/* Copyright (c) 2003-2007 Determina Corp. */
/* Copyright (c) 2001-2003 Massachusetts Institute of Technology */
/* Copyright (c) 2000-2001 Hewlett-Packard Company */
/* file "emit_utils_shared.c" */
/* The Pentium processors maintain cache consistency in hardware, so we don't
* worry about getting stale cache entries.
*/
/* FIXME i#1551: flush the code cache after updating it on ARM, because ARM
* hardware does not maintain cache consistency.
*/
#include "../globals.h"
#include "../link.h"
#include "../fragment.h"
#include "../fcache.h"
#include "../emit.h"
#include "arch.h"
#include "instr.h"
#include "instr_create.h"
#include "instrlist.h"
#include "instrument.h" /* for dr_insert_call() */
#include "proc.h"
#include <string.h> /* for memcpy */
#include "decode.h"
#include "decode_fast.h"
#include "x86/decode_private.h"
#ifdef DEBUG
# include "disassemble.h"
#endif
#include <limits.h> /* for UCHAR_MAX */
#include "../perscache.h"
#ifdef VMX86_SERVER
# include "vmkuw.h"
#endif
/* fragment_t fields */
/* Byte offsets of individual fragment_t fields, computed with offsetof(). */
/* CAUTION: if TAG_OFFS changes from 0, must change indirect exit stub! */
#define FRAGMENT_START_PC_OFFS (offsetof(fragment_t, start_pc))
#define FRAGMENT_COUNTER_OFFS (offsetof(fragment_t, hot_counter))
#define FRAGMENT_PREFIX_SIZE_OFFS (offsetof(fragment_t, prefix_size))
#ifdef TRACE_HEAD_CACHE_INCR
/* linkstub_t field */
/* Byte offset of direct_linkstub_t.target_fragment; only defined for
* TRACE_HEAD_CACHE_INCR builds.
*/
# define LINKSTUB_TARGET_FRAG_OFFS (offsetof(direct_linkstub_t, target_fragment))
#endif
#ifdef PROFILE_LINKCOUNT
/* Byte offset of linkstub_t.count; only defined for PROFILE_LINKCOUNT builds. */
# define LINKSTUB_COUNT_OFFS (offsetof(linkstub_t, count))
#endif
/* N.B.: I decided to not keep supporting DCONTEXT_IN_EDI
* If we really want it later we can add it, it's a pain to keep
* maintaining it with every change here
*/
#ifdef DCONTEXT_IN_EDI
# error DCONTEXT_IN_EDI Not Implemented
#endif
/* make code more readable by shortening long lines
* we mark all as meta to avoid client interface asserts
*/
/* Short aliases for the instrlist_meta_{postinsert,preinsert,append} routines. */
#define POST instrlist_meta_postinsert
#define PRE instrlist_meta_preinsert
#define APP instrlist_meta_append
/**
** CAUTION!
**
** The following definitions and routines are highly dependent upon
** definitions made in x86.asm. Do NOT change any constants or code
** without first consulting that file.
**
**/
/***************************************************************************
***************************************************************************
** EXIT STUB
**
** WARNING: all exit stubs must support atomic linking and unlinking,
** meaning a link/unlink operation must involve a single store!
** There is an exception: a first-time link (detected using a sentinel
** LINKCOUNT_NEVER_LINKED_SENTINEL placed where the unlinked entry
** code will go once linked) does not need to be atomic.
**/
/* FIXME i#1551: update the comment to x86/arm in this file */
/*
direct branch exit_stub:
5x8 mov %xax, xax_offs(&dcontext) or tls
#if defined(PROFILE_LINKCOUNT) (PR 248210: x64 not supported)
| 1 lahf
| 3 seto %al
|#if !defined(LINKCOUNT_64_BITS)
| 6 inc l->count
|#else
| 7 add $1,l->count
| 7 adc $0,l->count+4
|#endif
| 2 add $0x7f,%al
| 1 sahf
#endif
5x10 mov &linkstub, %xax
5 jmp target addr
#if defined(PROFILE_LINKCOUNT) (PR 248210: x64 not supported)
|unlinked entry point:
| 5 movl %eax, eax_offs(&dcontext)
| 5 movl &linkstub, %eax
| 5 jmp fcache_return
|
| Notes: we link/unlink by modifying the 1st jmp to either target unlinked
| entry point or the target fragment. When we link for the first time
| we try to remove the eflags save/restore, shifting the 1st jmp up (the
| space between it and unlinked entry just becomes junk).
#endif
indirect branch exit_stub (only used if -indirect_stubs):
6x9 mov %xbx, xbx_offs(&dcontext) or tls
5x11 mov &linkstub, %xbx
5 jmp indirect_branch_lookup
indirect branches use xbx so that the flags can be saved into xax using
the lahf instruction!
xref PR 249775 on lahf support on x64.
for PROFILE_LINKCOUNT, the count increment is performed inside the
hashtable lookup (in both linked and unlinked paths) both since the flags
are saved there for the linked path and to save space in stubs
also see emit_inline_ibl_stub() below
*/
/* DIRECT_EXIT_STUB_SIZE is in arch_exports.h */
/* Local shorthand: size in bytes of a plain direct exit stub for the given
* FRAG_ flags.
*/
#define STUB_DIRECT_SIZE(flags) DIRECT_EXIT_STUB_SIZE(flags)
/* for -thread_private, we're relying on the fact that
* SIZE32_MOV_XBX_TO_TLS == SIZE32_MOV_XBX_TO_ABS, and that
* x64 always uses tls
*/
/* Indirect stub size = spill of xbx + load of the linkstub pointer + long jmp,
* for 32-bit and 64-bit stubs respectively.
*/
#define STUB_INDIRECT_SIZE32 \
(SIZE32_MOV_XBX_TO_TLS + SIZE32_MOV_PTR_IMM_TO_XAX + JMP_LONG_LENGTH)
#define STUB_INDIRECT_SIZE64 \
(SIZE64_MOV_XBX_TO_TLS + SIZE64_MOV_PTR_IMM_TO_XAX + JMP_LONG_LENGTH)
/* Selects the 32-bit or 64-bit indirect stub size based on FRAG_IS_32(flags). */
#define STUB_INDIRECT_SIZE(flags) \
(FRAG_IS_32(flags) ? STUB_INDIRECT_SIZE32 : STUB_INDIRECT_SIZE64)
/* STUB_COARSE_DIRECT_SIZE is in arch_exports.h */
/* Coarse-grain indirect stubs have the same size as regular indirect stubs. */
#define STUB_COARSE_INDIRECT_SIZE(flags) (STUB_INDIRECT_SIZE(flags))
/* Size in bytes of the counter increment in a linkcount stub: a 6-byte inc
* for a 32-bit counter, or a 7-byte add plus a 7-byte adc for a 64-bit
* counter (see the stub layout comment earlier in this file).
*/
#ifndef LINKCOUNT_64_BITS
# define LINKCOUNT_INCSIZE (6)
#else
# define LINKCOUNT_INCSIZE (7+7)
#endif
/* eflags save is seto %al (3 bytes) + lahf (1 byte); the restore is
* add $0x7f,%al (2 bytes) + sahf (1 byte), per the stub layout comment.
*/
#define LINKCOUNT_EFLAGS_SAVE (3+1)
#define LINKCOUNT_EFLAGS_RESTORE (2+1)
#define LINKCOUNT_FLAGSIZE (LINKCOUNT_EFLAGS_SAVE + LINKCOUNT_EFLAGS_RESTORE)
/* NOTE(review): LINKCOUNT_DIRECT_EXTRA and LINKCOUNT_UNLINKED_ENTRY currently
* expand to the same expression.  The code in this file uses the former as
* the number of bytes a linkcount stub adds in front of the unlinked entry
* code and the latter as the offset of the unlinked entry point from the
* start of the stub.
*/
#define LINKCOUNT_DIRECT_EXTRA(flags) \
(LINKCOUNT_INCSIZE + LINKCOUNT_FLAGSIZE + STUB_DIRECT_SIZE(flags))
#define LINKCOUNT_UNLINKED_ENTRY(flags) \
(LINKCOUNT_INCSIZE + LINKCOUNT_FLAGSIZE + STUB_DIRECT_SIZE(flags))
/* used to distinguish a never-linked direct exit -- once linked this
* will be replaced by the beginning of the unlink entry point, which is
* a save of xax, which will never look like this. we choose nops to
* avoid complicating our disassembly routines.
*/
#define LINKCOUNT_NEVER_LINKED_SENTINEL 0x90909090
/* Computes the number of bytes an exit stub occupies for an exit that
 * branches to "target" from a fragment carrying the given FRAG_ flags.
 * A result of 0 means this kind of exit has no stub.
 */
int
exit_stub_size(dcontext_t *dcontext, cache_pc target, uint flags)
{
    if (TEST(FRAG_COARSE_GRAIN, flags)) {
        /* Coarse-grain special case: bb building points the exit at the bb
         * ibl, and insert_exit_stub later redirects it to the appropriate
         * coarse prefix.  So emit()-time callers hand us a real ibl routine,
         * but later callers (e.g. disassembly asking linkstub_size()) go
         * through EXIT_TARGET_TAG() -> indirect_linkstub_target(), which
         * returns get_coarse_ibl_prefix(): that would not be recognized as
         * indirect by the check below, so de-reference the prefix first.
         * Note that coarse_indirect_stub_jmp_target() derefs the prefix too:
         * should we require callers who have the stub pc to call that
         * instead of us de-referencing?
         */
        target = coarse_deref_ibl_prefix(dcontext, target);
    }
    if (!is_indirect_branch_lookup_routine(dcontext, target)) {
        /* direct branch */
        if (TEST(FRAG_COARSE_GRAIN, flags))
            return (STUB_COARSE_DIRECT_SIZE(flags));
#ifdef PROFILE_LINKCOUNT
        /* traces being profiled carry the linkcount code ahead of the stub */
        if (dynamo_options.profile_counts && (flags & FRAG_IS_TRACE) != 0)
            return (STUB_DIRECT_SIZE(flags) + LINKCOUNT_DIRECT_EXTRA(flags));
#endif
        return (STUB_DIRECT_SIZE(flags));
    } else {
        /* indirect branch */
        /* FIXME: we are not given the stub flags, so the target routine's
         * template is found in a roundabout way: by dispatching on the
         * ibl_routine entry point.
         */
        ibl_code_t *ibl_code;
        ibl_type_t ibl_type;
        IF_X64(gencode_mode_t mode;)
        DEBUG_DECLARE(bool is_ibl = )
            get_ibl_routine_type_ex(dcontext, target, &ibl_type _IF_X64(&mode));
        ASSERT(is_ibl);
        IF_X64(ASSERT(mode == FRAGMENT_GENCODE_MODE(flags) ||
                      (DYNAMO_OPTION(x86_to_x64) && mode == GENCODE_X86_TO_X64)));
        ibl_code = get_ibl_routine_code_ex(dcontext, ibl_type.branch_type, flags
                                           _IF_X64(mode));
        if (!EXIT_HAS_STUB(ibltype_to_linktype(ibl_code->branch_type),
                           IBL_FRAG_FLAGS(ibl_code)))
            return 0;
        if (TEST(FRAG_COARSE_GRAIN, flags)) {
            IF_WINDOWS(ASSERT(!is_shared_syscall_routine(dcontext, target)));
            /* keep in synch w/ coarse_indirect_stub_size() */
            return (STUB_COARSE_INDIRECT_SIZE(flags));
        }
#ifdef WINDOWS
        if (is_shared_syscall_routine(dcontext, target)) {
            return INTERNAL_OPTION(shared_syscalls_fastpath) ? 5 :
                STUB_INDIRECT_SIZE(flags);
        }
#endif
        /* an inlined ibl head has its own stub length */
        return ibl_code->ibl_head_is_inlined ?
            ibl_code->inline_stub_length : (STUB_INDIRECT_SIZE(flags));
    }
}
/* Shared worker for the "is this exit stub patchable" queries: takes the
 * exit's target, its linkstub flags and the owning fragment's flags, and
 * returns whether the stub itself is ever patched.
 */
static bool
is_patchable_exit_stub_helper(dcontext_t *dcontext, cache_pc ltarget,
                              ushort lflags, uint fflags)
{
    if (!LINKSTUB_INDIRECT(lflags)) {
        /* direct: the stub is only patched in these special builds */
        ASSERT(LINKSTUB_DIRECT(lflags));
#if defined(PROFILE_LINKCOUNT) || defined(TRACE_HEAD_CACHE_INCR)
        return true;
#else
        return false;
#endif
    }
    /* indirect */
    if (!DYNAMO_OPTION(indirect_stubs))
        return false;
#ifdef WINDOWS
    /* shared syscall exits are never treated as inlined-ibl exits */
    if (is_shared_syscall_routine(dcontext, ltarget))
        return true;
#endif
    if (get_ibl_routine_code(dcontext, extract_branchtype(lflags), fflags)
        ->ibl_head_is_inlined)
        return !DYNAMO_OPTION(atomic_inlined_linking);
    return true;
}
/* Returns whether the exit stub belonging to linkstub l of fragment f is
 * ever patched; defers to is_patchable_exit_stub_helper().
 */
bool
is_patchable_exit_stub(dcontext_t *dcontext, linkstub_t *l, fragment_t *f)
{
    cache_pc exit_target = EXIT_TARGET_TAG(dcontext, f, l);
    return is_patchable_exit_stub_helper(dcontext, exit_target, l->flags, f->flags);
}
/* Same query as is_patchable_exit_stub(), but for an exit cti instruction
 * that does not have a linkstub yet: the linkstub flags are synthesized
 * from the instruction.
 */
bool
is_exit_cti_stub_patchable(dcontext_t *dcontext, instr_t *inst, uint frag_flags)
{
    app_pc exit_tgt;
    /* Build the flags the eventual linkstub would carry.
     * N.B.: this must stay in step with the LINKSTUB_ macros.
     */
    ushort lflags = (ushort) instr_exit_branch_type(inst);
    ASSERT_TRUNCATE(lflags, ushort, instr_exit_branch_type(inst));
    ASSERT(instr_is_exit_cti(inst));
    exit_tgt = instr_get_branch_target_pc(inst);
    if (is_indirect_branch_lookup_routine(dcontext, exit_tgt))
        lflags |= LINK_INDIRECT;
    else
        lflags |= LINK_DIRECT;
    return is_patchable_exit_stub_helper(dcontext, exit_tgt, lflags, frag_flags);
}
/* Returns how many padding bytes must precede a stub placed at startpc so
 * that the stub's patchable location is properly aligned; 0 when the stub
 * is not patchable or is already aligned.
 */
uint
bytes_for_exitstub_alignment(dcontext_t *dcontext, linkstub_t *l,
                             fragment_t *f, byte *startpc)
{
    ptr_uint_t shift;
    byte *patch_pc;
    if (!is_patchable_exit_stub(dcontext, l, f))
        return 0;
    /* assumption - we only hot patch the ending jmp of the exit stub
     * (and that exit stub size returns the right values)
     */
    patch_pc = startpc +
        exit_stub_size(dcontext, EXIT_TARGET_TAG(dcontext, f, l), f->flags) -
        EXIT_STUB_PATCH_OFFSET;
    shift = ALIGN_SHIFT_SIZE(patch_pc, EXIT_STUB_PATCH_SIZE, PAD_JMPS_ALIGNMENT);
#ifdef PROFILE_LINKCOUNT
    /* assumption doesn't hold because of the optimize ... */
    /* FIXME : once this is implemented re-enable the ifdefed out stats
     * in emit_fragment_common */
    ASSERT_NOT_IMPLEMENTED(false);
#endif
    IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_uint(shift)));
    return (uint) shift;
}
/* Upper bound on the padding bytes that may be needed when add_frag is
 * appended to a trace.
 */
uint
extend_trace_pad_bytes(fragment_t *add_frag)
{
    /* A standard bb has at most 2 patchable locations (it ends in a
     * conditional, or in an indirect that is promoted to inlined).
     */
    const uint patchable_locations_per_bb = 2;
    /* FIXME : this is a poor estimate.  We could do better by walking the
     * linkstubs and checking whether ibl is being inlined, but this value
     * is only used by monitor.c for a max size check, so overestimating
     * just yields slightly shorter max size traces.
     */
    /* Normal builds never trace through traces, so the number of exits is
     * not a concern (FIXME this also assumes bbs don't trace through
     * conditional or indirect branches).
     */
    ASSERT_NOT_IMPLEMENTED(!TEST(FRAG_IS_TRACE, add_frag->flags));
    /* With -pad_jmps_shift_bb we also assume no nops need to be removed
     * from fragments added to traces, since none should exist when only
     * bbs are added (nop_pad_ilist has an assert verifying that no nops are
     * added to bbs under -pad_jmps_shift_bb without marking them
     * CANNOT_BE_TRACE, so here we likewise verify that only bbs are added).
     * Xref PR 215179: UNIX syscall fence exits and CLIENT_INTERFACE
     * added/moved exits can produce bbs with additional hot_patchable
     * locations.  nop_pad_ilist() marks such bbs CANNOT_BE_TRACE if
     * -pad_jmps_mark_no_trace is set, or asserts otherwise, to avoid
     * various difficulties, so they should not be seen here.
     */
    return patchable_locations_per_bb*MAX_PAD_SIZE;
}
/* Pads forward from startpc as needed so the patchable jmp of the exit stub
 * for l ends up properly aligned, and returns the (possibly advanced) pc at
 * which the stub should be emitted.
 */
byte *
pad_for_exitstub_alignment(dcontext_t *dcontext, linkstub_t *l,
                           fragment_t *f, byte *startpc)
{
    uint pad_bytes;
    ASSERT(PAD_FRAGMENT_JMPS(f->flags)); /* shouldn't call this otherwise */
    pad_bytes = bytes_for_exitstub_alignment(dcontext, l, f, startpc);
    if (pad_bytes == 0) {
        STATS_PAD_JMPS_ADD(f->flags, num_stubs_no_shift, 1);
        return startpc;
    }
    /* Fill the gap with 1 byte instructions so it looks nice in debuggers;
     * decode_fragment also checks for this as a sanity check.  These
     * instructions can never be reached, but they will be decoded by shift
     * fcache pointers, so something valid must be placed here.
     */
    SET_TO_DEBUG(startpc, pad_bytes);
    STATS_PAD_JMPS_ADD(f->flags, num_shifted_stubs, 1);
    STATS_PAD_JMPS_ADD(f->flags, shifted_stub_bytes, pad_bytes);
    return startpc + pad_bytes;
}
/* Deletes every nop instruction from ilist.
 * Only used if -no_pad_jmps_shift_{bb,trace}.  FIXME: this routine is
 * expensive (because of the instr_expand) and it may also remove app nops
 * (an optimization, but not really the goal here).
 */
void
remove_nops_from_ilist(dcontext_t *dcontext, instrlist_t *ilist
                       _IF_DEBUG(bool recreating))
{
    instr_t *cur = instrlist_first(ilist);
    while (cur != NULL) {
        instr_t *following;
        /* FIXME : expensive, just expand instr before cti, function not used
         * if -no_pad_jmps_shift_{bb,trace} */
        cur = instr_expand(dcontext, ilist, cur);
        /* grab the successor first: cur may be destroyed below */
        following = instr_get_next(cur);
        if (instr_is_nop(cur)) {
            instrlist_remove(ilist, cur);
            DOSTATS({
                if (!recreating) {
                    STATS_INC(num_nops_removed);
                    STATS_ADD(num_nop_bytes_removed, instr_length(dcontext, cur));
                }
            });
            instr_destroy(dcontext, cur);
        }
        cur = following;
    }
}
/* Picks the fcache_return variant that an unlinked direct exit stub of a
 * fragment with the given flags should jump to.
 */
cache_pc
get_direct_exit_target(dcontext_t *dcontext, uint flags)
{
    if (!FRAG_DB_SHARED(flags)) {
        return fcache_return_routine_ex(dcontext _IF_X64(FRAGMENT_GENCODE_MODE(flags)));
    }
    if (TEST(FRAG_COARSE_GRAIN, flags)) {
        /* note that entrance stubs should target their unit's prefix,
         * who will then target this routine
         */
        return fcache_return_coarse_routine(IF_X64(FRAGMENT_GENCODE_MODE(flags)));
    }
    return fcache_return_shared_routine(IF_X64(FRAGMENT_GENCODE_MODE(flags)));
}
/* Emits the exit stub for linkstub l of fragment f at stub_pc, using the
 * linkstub's own flags, and returns whatever
 * insert_exit_stub_other_flags() returns.
 */
int
insert_exit_stub(dcontext_t *dcontext, fragment_t *f,
                 linkstub_t *l, cache_pc stub_pc)
{
    int result = insert_exit_stub_other_flags(dcontext, f, l, stub_pc, l->flags);
    return result;
}
/* Retargets the (direct) branch located at branch_pc so that it branches to
 * target_pc.  The store that performs the patch is atomic, so this is safe
 * with respect to a thread executing the branch provided that both the old
 * and the new targets are valid and that [pc, pc+4) does not cross a cache
 * line.
 */
void
patch_branch(cache_pc branch_pc, cache_pc target_pc, bool hot_patch)
{
    /* locate the branch's displacement field, then rewrite it */
    cache_pc disp_pc = exit_cti_disp_pc(branch_pc);
    insert_relative_target(disp_pc, target_pc, hot_patch);
}
#ifdef PROFILE_LINKCOUNT
/* Retargets the jmp that ends the link code of an already-linked linkcount
 * stub starting at pc.  Returns the value produced by
 * insert_relative_target() for the rewritten operand.
 */
static byte *
change_linkcount_target(byte *pc, app_pc target)
{
    /* After the first link, the jmp at the end of the link code goes either
     * to the unlinked entry (which has no counter inc code of its own,
     * which is why the exit jmp doesn't go straight there) or to the
     * target.  The first opcode tells us which state the stub is in
     * (depending on whether eflags had to be saved), and so where the jmp
     * operand lives.
     */
    byte *jmp_operand;
    if (*pc == 0xff || *pc == 0x83) { /* inc/add is 1st instr */
        jmp_operand = pc + (LINKCOUNT_INCSIZE + 1);
    } else {
        IF_X64(ASSERT_NOT_IMPLEMENTED(false)); /* need to pass in flags */
        jmp_operand = pc + (LINKCOUNT_INCSIZE + LINKCOUNT_FLAGSIZE +
                            STUB_DIRECT_SIZE(FRAG_32_BIT) - 4);
    }
    return insert_relative_target(jmp_operand, target, HOT_PATCHABLE);
}
/* First-time link of a linkcount-profiled direct exit: rewrites the stub for
 * linkstub l of fragment f so that it increments the counter and jumps to
 * targetf, dropping the eflags save/restore when the target is known to
 * write the arithmetic flags before reading them.  Finally replaces the
 * never-linked sentinel with the real unlinked entry code.
 * This rewrite is not atomic; callers rely on it only happening on the first
 * link of the stub.
 */
static void
optimize_linkcount_stub(dcontext_t *dcontext, fragment_t *f,
                        linkstub_t *l, fragment_t *targetf)
{
    /* first-time link: try to remove eflags save/restore */
# ifdef CUSTOM_EXIT_STUBS
    byte *stub_pc = (byte *) EXIT_FIXED_STUB_PC(dcontext, f, l);
# else
    byte *stub_pc = (byte *) EXIT_STUB_PC(dcontext, f, l);
# endif
    byte *pc = stub_pc;
    bool remove_eflags_save = false;
    ASSERT(linkstub_owned_by_fragment(dcontext, f, l));
    ASSERT(LINKSTUB_DIRECT(l->flags));
    if (!INTERNAL_OPTION(unsafe_ignore_eflags_prefix)) {
        remove_eflags_save = TEST(FRAG_WRITES_EFLAGS_6, targetf->flags);
    }
    else {
        /* scan through code at target fragment, stop scanning at 1st branch */
        uint eflags = 0;
        /* The scan walks targetf's body, so its bound must be the first exit
         * cti of targetf: that linkstub belongs to targetf and its pc has to
         * be computed relative to targetf, not to the source fragment f.
         */
        cache_pc end_pc = EXIT_CTI_PC(targetf, FRAGMENT_EXIT_STUBS(targetf));
        byte *fpc = (byte *) FCACHE_ENTRY_PC(targetf);
        /* for simplicity, stop at first instr that touches the flags */
        while (eflags == 0 && fpc != NULL && ((cache_pc)fpc) < end_pc) {
            fpc = decode_eflags_usage(dcontext, fpc, &eflags);
        }
        remove_eflags_save =
            (eflags & (EFLAGS_WRITE_6|EFLAGS_READ_6)) == EFLAGS_WRITE_6;
    }
    if (remove_eflags_save) {
        /* the 6 flags modified by add and adc are written before
         * they're read -> don't need to save eflags!
         *
         * I tried replacing lahf & sahf w/ nops, it's noticeably
         * faster to not have the nops, so redo the increment:
         */
        pc = insert_linkcount_inc(pc, l);
        pc = insert_relative_jump(pc, FCACHE_ENTRY_PC(targetf),
                                  NOT_HOT_PATCHABLE);
        /* Fill out with nops till the unlinked entry point so disassembles
         * nicely for logfile (we're profile linkcount so presumably going
         * to dump this). */
        while (pc < (stub_pc + LINKCOUNT_DIRECT_EXTRA(f->flags))) {
            *pc = 0x90; pc++; /* nop */
        }
    } else {
        /* keep eflags save & restore -- need to keep save of eax
         * so skip all that now, go to right before store of &l into eax
         */
        pc += LINKCOUNT_DIRECT_EXTRA(f->flags) - 5 - 5;
        /* need to insert a restore of eax -- luckily it perfectly
         * overwrites the store of &l into eax, FIXME - dangerous
         * though, if we ever drop the addr16 flag on a shared restore the
         * instruction will be 6 bytes and our hardcoded 5 above will
         * lead to a crash (should trigger assert below at least).
         */
        pc = insert_restore_xax(dcontext, pc, f->flags, FRAG_DB_SHARED(f->flags),
                                DIRECT_STUB_SPILL_SLOT, true);
        ASSERT(pc == stub_pc + LINKCOUNT_DIRECT_EXTRA(f->flags) - 5);
        /* now add jmp */
        pc = insert_relative_jump(pc, FCACHE_ENTRY_PC(targetf),
                                  NOT_HOT_PATCHABLE);
    }
    /* we need to replace our never-linked sentinel w/ the real
     * unlinked entry point.
     */
    ASSERT(pc == stub_pc + LINKCOUNT_DIRECT_EXTRA(f->flags));
    pc = stub_pc + LINKCOUNT_DIRECT_EXTRA(f->flags);
    ASSERT(*((uint *)pc) == LINKCOUNT_NEVER_LINKED_SENTINEL);
    pc = insert_save_xax(dcontext, pc, f->flags, FRAG_DB_SHARED(f->flags),
                         DIRECT_STUB_SPILL_SLOT, true);
    /* mov $linkstub_ptr,%xax */
    *pc = MOV_IMM2XAX_OPCODE; pc++;
    /* the 4-byte store of the linkstub pointer below is 32-bit only */
    IF_X64(ASSERT_NOT_IMPLEMENTED(false));
    *((uint *)pc) = (uint)l; pc += 4;
    /* jmp to target */
    pc = insert_relative_jump(pc, get_direct_exit_target(dcontext, f->flags),
                              NOT_HOT_PATCHABLE);
}
#endif /* PROFILE_LINKCOUNT */
/* Alignment check for a patchable exit cti that would be emitted at pc.
 * Yields 0 when the cti's patched bytes are already properly aligned for
 * patching, otherwise the number of bytes the cti would have to be shifted
 * forward to become properly aligned.
 */
uint
patchable_exit_cti_align_offs(dcontext_t *dcontext, instr_t *inst, cache_pc pc)
{
    cache_pc patch_pc;
    /* all our exit cti's currently use 4 byte offsets */
    /* FIXME : would be better to use a instr_is_cti_long or some such
     * also should check for addr16 flag (we shouldn't have any prefixes) */
    ASSERT((instr_is_cti(inst) && !instr_is_cti_short(inst) &&
            !TESTANY(~(PREFIX_JCC_TAKEN|PREFIX_JCC_NOT_TAKEN), instr_get_prefixes(inst)))
           || instr_is_cti_short_rewrite(inst, NULL));
    /* the patched bytes are the last CTI_PATCH_SIZE bytes of the instruction */
    patch_pc = pc + instr_length(dcontext, inst) - CTI_PATCH_SIZE;
    IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_uint
                  (ALIGN_SHIFT_SIZE(patch_pc, CTI_PATCH_SIZE, PAD_JMPS_ALIGNMENT))));
    return (uint) ALIGN_SHIFT_SIZE(patch_pc, CTI_PATCH_SIZE, PAD_JMPS_ALIGNMENT);
}
/* Reports whether the given exit cti is ever modified dynamically. */
bool
is_exit_cti_patchable(dcontext_t *dcontext, instr_t *inst, uint frag_flags)
{
    app_pc target;
    if (TEST(FRAG_COARSE_GRAIN, frag_flags)) {
        /* Case 8647: coarse grain fragment bodies always link through stubs
         * until frozen, so their ctis are never patched except at freeze time
         * when we suspend the world.
         */
        ASSERT(!TEST(FRAG_IS_TRACE, frag_flags));
        return false;
    }
    ASSERT(instr_is_exit_cti(inst));
    target = instr_get_branch_target_pc(inst);
    if (!is_indirect_branch_lookup_routine(dcontext, target)) {
        /* direct exit */
#ifdef PROFILE_LINKCOUNT
        if (DYNAMO_OPTION(profile_counts) && TEST(FRAG_IS_TRACE, frag_flags)) {
# ifdef CUSTOM_EXIT_STUBS
            return true;
# else
            return false;
# endif
        }
#endif
        /* special exits are never patched; all other direct exits are */
        return !instr_branch_special_exit(inst);
    }
    /* indirect exit: whether it has an inline stub or not, the cti is
     * always patched if -no_indirect_stubs
     */
    if (!DYNAMO_OPTION(indirect_stubs))
        return true;
#ifdef WINDOWS
    if (target == shared_syscall_routine(dcontext))
        return false;
#endif
    return get_ibl_routine_code(dcontext,
                                extract_branchtype((ushort)instr_exit_branch_type(inst)),
                                frag_flags)->ibl_head_is_inlined;
}
/* Links the direct exit described by linkstub l of fragment f to targetf.
* hot_patch is forwarded to patch_branch() for the patches made here.
*
* returns true if exit cti no longer points at stub
* (certain situations, like profiling or TRACE_HEAD_CACHE_INCR, go
* through the stub even when linked)
*/
bool
link_direct_exit(dcontext_t *dcontext, fragment_t *f, linkstub_t *l, fragment_t *targetf,
bool hot_patch)
{
/* stub_pc is only needed by the stub-patching paths below */
#if defined(PROFILE_LINKCOUNT) || defined(TRACE_HEAD_CACHE_INCR)
# ifdef CUSTOM_EXIT_STUBS
byte *stub_pc = (byte *) (EXIT_FIXED_STUB_PC(dcontext, f, l));
# else
byte *stub_pc = (byte *) (EXIT_STUB_PC(dcontext, f, l));
# endif
#endif
ASSERT(linkstub_owned_by_fragment(dcontext, f, l));
ASSERT(LINKSTUB_DIRECT(l->flags));
STATS_INC(num_direct_links);
#ifdef PROFILE_LINKCOUNT
if (dynamo_options.profile_counts && TEST(FRAG_IS_TRACE, f->flags)) {
/* do not change the exit jmp, instead change the stub itself */
/* the sentinel is still in place only if this stub was never linked */
if (*((uint *)(stub_pc + LINKCOUNT_DIRECT_EXTRA(f->flags))) ==
LINKCOUNT_NEVER_LINKED_SENTINEL) {
/* this is not atomic, but that's ok, it's first-time only */
/* FIXME - this assumption is so not safe with shared cache
* since we add to table and link incoming before linking outgoing
*/
optimize_linkcount_stub(dcontext, f, l, targetf);
# ifdef CUSTOM_EXIT_STUBS
/* FIXME: want flag that says whether should go through custom
* only when unlinked, or always!
* For now we assume only when unlinked:
*/
/* skip custom code */
patch_branch(EXIT_CTI_PC(f, l), stub_pc,
TEST(FRAG_SHARED, f->flags) ? hot_patch : NOT_HOT_PATCHABLE);
# endif
} else {
# ifdef CUSTOM_EXIT_STUBS
/* FIXME: want flag that says whether should go through custom
* only when unlinked, or always!
* For now we assume only when unlinked:
*/
/* skip custom code */
patch_branch(EXIT_CTI_PC(f, l), stub_pc, hot_patch);
# endif
/* already linked once: only the jmp inside the stub is retargeted */
change_linkcount_target(stub_pc, FCACHE_ENTRY_PC(targetf));
}
# ifdef TRACE_HEAD_CACHE_INCR
/* yes, we wait for linkcount to do its thing and then we change it --
* but to make it more efficient will make this already ungainly
* code even harder to read
*/
/* FIXME - atomicity issues? */
if ((targetf->flags & FRAG_IS_TRACE_HEAD) != 0) {
/* after optimized inc, jmp to unlinked code, but change its final
* jmp to go to incr routine
*/
change_linkcount_target(stub_pc, stub_pc + LINKCOUNT_UNLINKED_ENTRY(f->flags));
LOG(THREAD, LOG_LINKS, 4,
"\tlinking F%d."PFX" to incr routine b/c F%d is trace head\n",
f->id, EXIT_CTI_PC(f, l), targetf->id);
/* NOTE(review): the +10 presumably skips the two 5-byte instructions
* at the unlinked entry to reach its final jmp -- confirm against the
* stub layout comment earlier in this file
*/
patch_branch(stub_pc + LINKCOUNT_UNLINKED_ENTRY(f->flags) + 10,
trace_head_incr_routine(dcontext), hot_patch);
}
# endif
return false; /* going through stub */
}
#endif /* PROFILE_LINKCOUNT */
#ifdef TRACE_HEAD_CACHE_INCR
if ((targetf->flags & FRAG_IS_TRACE_HEAD) != 0) {
LOG(THREAD, LOG_LINKS, 4,
"\tlinking F%d."PFX" to incr routine b/c F%d is trace head\n",
f->id, EXIT_CTI_PC(f, l), targetf->id);
/* FIXME: more efficient way than multiple calls to get size-5? */
ASSERT(linkstub_size(dcontext, f, l) == DIRECT_EXIT_STUB_SIZE(f->flags));
/* NOTE(review): size-5 presumably addresses the 5-byte jmp that ends the
* direct stub -- confirm against the stub layout comment
*/
patch_branch(stub_pc + DIRECT_EXIT_STUB_SIZE(f->flags) - 5,
trace_head_incr_routine(dcontext), hot_patch);
return false; /* going through stub */
}
#endif
/* change jmp target to point to the passed-in target */
#ifdef UNSUPPORTED_API
if ((l->flags & LINK_TARGET_PREFIX) != 0) {
/* want to target just the xcx restore, not the eflags restore
* (only ibl targets eflags restore)
*/
patch_branch(EXIT_CTI_PC(f, l), FCACHE_PREFIX_ENTRY_PC(targetf),
hot_patch);
} else
#endif
patch_branch(EXIT_CTI_PC(f, l), FCACHE_ENTRY_PC(targetf), hot_patch);
return true; /* do not need stub anymore */
}
/* Unlinks the direct exit described by linkstub l of fragment f.  In the
* default configuration this hot-patches the exit cti to point back at the
* top of its exit stub; the PROFILE_LINKCOUNT and TRACE_HEAD_CACHE_INCR
* builds instead (or additionally) retarget jmps inside the stub.
*/
void
unlink_direct_exit(dcontext_t *dcontext, fragment_t *f, linkstub_t *l)
{
cache_pc stub_pc = (cache_pc) EXIT_STUB_PC(dcontext, f, l);
#ifdef TRACE_HEAD_CACHE_INCR
direct_linkstub_t *dl = (direct_linkstub_t *) l;
#endif
ASSERT(linkstub_owned_by_fragment(dcontext, f, l));
ASSERT(LINKSTUB_DIRECT(l->flags));
#ifdef PROFILE_LINKCOUNT
if (dynamo_options.profile_counts && TEST(FRAG_IS_TRACE, f->flags)) {
byte *pc;
if (*((uint *)(stub_pc + LINKCOUNT_DIRECT_EXTRA(f->flags))) ==
LINKCOUNT_NEVER_LINKED_SENTINEL) {
/* never been linked, don't go pointing at the uninitialized
* unlink entry point -- just return, initial state is fine
*/
return;
}
# ifdef CUSTOM_EXIT_STUBS
/* with custom stubs, work on the fixed part of the stub from here on */
pc = (byte *) (EXIT_FIXED_STUB_PC(dcontext, f, l));
stub_pc = (cache_pc) pc;
/* FIXME: want flag that says whether should go through custom
* only when unlinked, or always! Also is racy with 2nd branch patch.
* For now we assume only when unlinked.
*/
/* go through custom code again */
patch_branch(EXIT_CTI_PC(f, l), stub_pc, HOT_PATCHABLE);
# else
pc = (byte *) stub_pc;
# endif
# ifdef TRACE_HEAD_CACHE_INCR
if (dl->target_fragment != NULL) { /* HACK to tell if targeted trace head */
/* make unlinked jmp go back to fcache_return */
/* NOTE(review): the +10 presumably skips the two 5-byte instructions at
* the unlinked entry to reach its final jmp -- confirm against the stub
* layout comment earlier in this file
*/
patch_branch(pc + LINKCOUNT_UNLINKED_ENTRY(f->flags) + 10,
get_direct_exit_target(dcontext, f->flags),
HOT_PATCHABLE);
} else
# endif
/* make jmp after incr go to unlinked entry */
change_linkcount_target(pc, stub_pc + LINKCOUNT_UNLINKED_ENTRY(f->flags));
/* the exit cti itself keeps pointing at the stub in this mode */
return;
}
#endif
#ifdef TRACE_HEAD_CACHE_INCR
if (dl->target_fragment != NULL) { /* HACK to tell if targeted trace head */
# ifdef CUSTOM_EXIT_STUBS
byte *pc = (byte *) (EXIT_FIXED_STUB_PC(dcontext, f, l));
# else
byte *pc = (byte *) (EXIT_STUB_PC(dcontext, f, l));
# endif
/* FIXME: more efficient way than multiple calls to get size-5? */
ASSERT(linkstub_size(dcontext, f, l) == DIRECT_EXIT_STUB_SIZE(f->flags));
/* restore the stub's final jmp (which link_direct_exit pointed at the
* trace head incr routine) to the regular direct exit target
*/
patch_branch(pc + DIRECT_EXIT_STUB_SIZE(f->flags) - 5,
get_direct_exit_target(dcontext, f->flags),
HOT_PATCHABLE);
}
#endif
/* change jmp target to point to top of exit stub */
patch_branch(EXIT_CTI_PC(f, l), stub_pc, HOT_PATCHABLE);
}
/* Links the indirect exit described by linkstub l of fragment f.
 * NOTE : for inlined indirect branches linking is !NOT! atomic with respect
 * to a thread executing in the cache unless using the atomic_inlined_linking
 * option (unlike unlinking)
 */
void
link_indirect_exit(dcontext_t *dcontext, fragment_t *f, linkstub_t *l, bool hot_patch)
{
    app_pc target_tag = EXIT_TARGET_TAG(dcontext, f, l);
    /* Indirect exits have their stub pcs computed from the cti targets, so
     * the stub pc must be captured while the exit is in a consistent state
     * (inlined stubs are modified in multiple stages).
     */
    byte *stub_pc = (byte *) EXIT_STUB_PC(dcontext, f, l);
#ifdef CUSTOM_EXIT_STUBS
    byte *fixed_stub_pc = (byte *) EXIT_FIXED_STUB_PC(dcontext, f, l);
#endif
    ASSERT(!TEST(FRAG_COARSE_GRAIN, f->flags));
    ASSERT(linkstub_owned_by_fragment(dcontext, f, l));
    ASSERT(LINKSTUB_INDIRECT(l->flags));
    /* the target never changes, so re-linking a linked exit is a nop */
    if ((l->flags & LINK_LINKED) != 0) {
        STATS_INC(num_indirect_already_linked);
        return;
    }
    STATS_INC(num_indirect_links);
# ifdef WINDOWS
    if (!is_shared_syscall_routine(dcontext, target_tag))
# endif
    {
        if (get_ibl_routine_code(dcontext, extract_branchtype(l->flags), f->flags)
            ->ibl_head_is_inlined) {
            /* need to make branch target the top of the exit stub */
            patch_branch(EXIT_CTI_PC(f, l), stub_pc, hot_patch);
            /* with atomic inlined linking that single patch is the whole link */
            if (DYNAMO_OPTION(atomic_inlined_linking))
                return;
        }
    }
    link_indirect_exit_arch(dcontext, f, l, hot_patch, target_tag);
}
/* Returns the offset of the unlink entry point within the exit stub of
 * linkstub l.  Only an indirect exit whose ibl head is inlined has a
 * non-zero offset; all other exits yield 0.
 */
int
linkstub_unlink_entry_offset(dcontext_t *dcontext, fragment_t *f, linkstub_t *l)
{
    ibl_code_t *ibl_code;
    ASSERT(linkstub_owned_by_fragment(dcontext, f, l));
    if (!LINKSTUB_INDIRECT(l->flags))
        return 0;
#ifdef WINDOWS
    if (is_shared_syscall_routine(dcontext, EXIT_TARGET_TAG(dcontext, f, l)))
        return 0;
#endif
    ibl_code = get_ibl_routine_code(dcontext, extract_branchtype(l->flags), f->flags);
    return ibl_code->ibl_head_is_inlined ? ibl_code->inline_unlink_offs : 0;
}
/* Returns the routine that the indirect exit l of fragment f targets: the
 * shared syscall routine (Windows only), the coarse ibl prefix for
 * coarse-grain fragments, or otherwise the matching ibl routine.
 */
cache_pc
indirect_linkstub_target(dcontext_t *dcontext, fragment_t *f, linkstub_t *l)
{
    ASSERT(LINKSTUB_INDIRECT(l->flags));
    ASSERT(!TESTANY(LINK_NI_SYSCALL_ALL, l->flags));
#ifdef WINDOWS
    if (EXIT_TARGETS_SHARED_SYSCALL(l->flags)) {
        /* currently this is the only way to distinguish shared_syscall
         * exit from other indirect exits and from other exits in
         * a fragment containing ignorable or non-ignorable syscalls
         */
        ASSERT(TEST(FRAG_HAS_SYSCALL, f->flags));
        return shared_syscall_routine_ex(dcontext
                                         _IF_X64(FRAGMENT_GENCODE_MODE(f->flags)));
    }
#endif
    if (!TEST(FRAG_COARSE_GRAIN, f->flags)) {
        return get_ibl_routine_ex(dcontext, get_ibl_entry_type(l->flags),
                                  get_source_fragment_type(dcontext, f->flags),
                                  extract_branchtype(l->flags)
                                  _IF_X64(FRAGMENT_GENCODE_MODE(f->flags)));
    }
    /* Coarse-grain: need to target the ibl prefix.  Passing in the cti works
     * as well as the stub, and avoids a circular dependence where
     * linkstub_unlink_entry_offset() calls this routine to get the target
     * and then this routine asks for the stub, which calls
     * linkstub_unlink_entry_offset()...
     */
    return get_coarse_ibl_prefix(dcontext, EXIT_CTI_PC(f, l),
                                 extract_branchtype(l->flags));
}
/* Uses the current machine state to decide which exit must have been taken:
 * the conditional branch's linkstub l1 or the fall-through linkstub l2.
 */
linkstub_t *
linkstub_cbr_disambiguate(dcontext_t *dcontext, fragment_t *f,
                          linkstub_t *l1, linkstub_t *l2)
{
    instr_t cbr;
    linkstub_t *result = l2; /* fall-through unless the cbr was taken */
    instr_init(dcontext, &cbr);
    decode(dcontext, EXIT_CTI_PC(f, l1), &cbr);
    ASSERT(instr_is_cbr(&cbr));
    if (instr_cbr_taken(&cbr, get_mcontext(dcontext), false/*post-state*/))
        result = l1;
    instr_free(dcontext, &cbr);
    return result;
}
/*******************************************************************************
* COARSE-GRAIN FRAGMENT SUPPORT
*/
/* Returns true iff stub is a coarse entrance stub whose jmp currently
 * targets the trace head return coarse prefix.
 * FIXME: case 10334: pass in info?
 */
bool
coarse_is_trace_head(cache_pc stub)
{
    cache_pc tgt;
    if (!coarse_is_entrance_stub(stub))
        return false;
    tgt = entrance_stub_jmp_target(stub);
    /* FIXME: could see if tgt is a jmp and deref and cmp to
     * trace_head_return_coarse_routine() to avoid the vmvector
     * lookup required to find the prefix
     */
    return tgt == trace_head_return_coarse_prefix(stub, NULL);
}
/* Returns the current target of the jmp inside the entrance stub at stub. */
cache_pc
entrance_stub_jmp_target(cache_pc stub)
{
    cache_pc jmp = entrance_stub_jmp(stub);
    ASSERT(jmp != NULL);
#ifdef X86
    ASSERT(*jmp == JMP_OPCODE);
#elif defined(ARM)
    /* FIXME i#1551: NYI on ARM */
    ASSERT_NOT_IMPLEMENTED(false);
#endif /* X86/ARM */
    /* the pc-relative displacement follows the 1-byte opcode */
    return (cache_pc) PC_RELATIVE_TARGET(jmp+1);
}
/* Returns the target tag recorded in the entrance stub at stub.
 * The tag is read from the immediate(s) of the store instruction(s) that end
 * right before the stub's jmp.
 * info is optional: when NULL it is looked up via get_stub_coarse_info(stub).
 * When info->mod_shift is non-zero and the stored tag lies within the
 * persist-time range [persist_base, persist_base + (end_pc - base_pc)),
 * mod_shift is subtracted from the tag before it is returned.
 * NOTE(review): info is dereferenced without a NULL check after the lookup --
 * presumably the lookup cannot fail for a valid stub; confirm.
 */
app_pc
entrance_stub_target_tag(cache_pc stub, coarse_info_t *info)
{
cache_pc jmp = entrance_stub_jmp(stub);
app_pc tag;
/* find the immed that is put into tls: at end of pre-jmp instr */
#ifdef X64
/* To identify whether 32-bit: we could look up the coarse_info_t
* this is part of but that's expensive so we check whether the
* tls offset has 2 high byte 0's (we always use addr16 for 32-bit).
* 32-bit:
* 67 64 c7 06 e0 0e 02 99 4e 7d addr16 mov $0x7d4e9902 -> %fs:0x0ee0
* 64-bit is split into high and low dwords:
* 65 c7 04 25 20 16 00 00 02 99 4e 7d mov $0x7d4e9902 -> %gs:0x1620
* 65 c7 04 25 24 16 00 00 00 00 00 00 mov $0x00000000 -> %gs:0x1624
* both are followed by a direct jmp.
*/
if (*((ushort *)(jmp-6)) == 0) { /* 64-bit has 2 0's for high 2 bytes of tls offs */
/* the high dword's immediate is the 4 bytes right before the jmp */
ptr_uint_t high32 = (ptr_uint_t) *((uint *)(jmp-4));
/* the low dword's immediate ends half of the two-store sequence earlier */
ptr_uint_t low32 = (ptr_uint_t)
*((uint *)(jmp - (SIZE64_MOV_PTR_IMM_TO_TLS/2) - 4));
tag = (cache_pc) ((high32 << 32) | low32);
} else { /* else fall-through to 32-bit case */
#endif
/* 32-bit form: a single pointer-sized read of the 4 bytes before the jmp */
tag = *((cache_pc *)(jmp-4));
#ifdef X64
}
#endif
/* if frozen, this could be a persist-time app pc (i#670).
* we take in info so we can know mod_shift (we can decode to find it
* for unlinked but not for linked)
*/
if (info == NULL)
info = get_stub_coarse_info(stub);
if (info->mod_shift != 0 &&
tag >= info->persist_base &&
tag < info->persist_base + (info->end_pc - info->base_pc))
tag -= info->mod_shift;
return tag;
}
/* Returns whether the code at pc begins with the tls spill of SCRATCH_REG1
 * (xbx/r1) into INDIRECT_STUB_SPILL_SLOT, which is how this file recognizes
 * an indirect stub.
 */
bool
coarse_is_indirect_stub(cache_pc pc)
{
    /* match insert_jmp_to_ibl */
    bool starts_with_spill =
        instr_raw_is_tls_spill(pc, SCRATCH_REG1/*xbx/r1*/, INDIRECT_STUB_SPILL_SLOT);
    return starts_with_spill;
}
/* Heuristically decides whether the cti inst, belonging to the coarse fragment
 * that starts at start_pc, targets a location inside that same fragment.
 *
 * We don't know the size of the fragment but we want to support
 * intra-fragment ctis for clients (i#665).  A real exit cti is either linked
 * to a target within the same coarse unit (where its target will be an entry
 * point) or points at a stub of some kind (frozen exit prefix or separate
 * entrance stub or inlined indirect stub); any of those means "not intra".
 *
 * caller should call fragment_coarse_entry_pclookup() ahead of time
 * to avoid deadlock if caller holds info->lock
 */
bool
coarse_cti_is_intra_fragment(dcontext_t *dcontext, coarse_info_t *info,
                             instr_t *inst, cache_pc start_pc)
{
    cache_pc target = opnd_get_pc(instr_get_target(inst));
    /* outside the largest possible extent of this fragment */
    if (target < start_pc || target >= start_pc + MAX_FRAGMENT_SIZE)
        return false;
    /* if target is an entry, then it's a linked exit cti
     * XXX: this may acquire info->lock if it's never been called before
     */
    if (fragment_coarse_entry_pclookup(dcontext, info, target) != NULL)
        return false;
    /* these lookups can get expensive but should only hit them
     * when have clients adding intra-fragment ctis.
     * XXX: is there a min distance we could use to rule out
     * being in stubs?  for frozen though prefixes are
     * right after cache.
     */
    if (coarse_is_indirect_stub(target))
        return false;
    if (in_coarse_stubs(target))
        return false;
    if (in_coarse_stub_prefixes(target))
        return false;
    return true;
}
/* Returns the final target of the indirect stub at stub by following two
 * jmps: the jmp occupying the last JMP_LONG_LENGTH bytes of the stub, and
 * then the jmp found at that first jmp's target.
 * Not implemented on ARM (asserts and returns NULL).
 */
cache_pc
coarse_indirect_stub_jmp_target(cache_pc stub)
{
#ifdef X86
cache_pc prefix_tgt, tgt;
cache_pc jmp;
size_t stub_size;
# ifdef X64
/* See the stub sequences in entrance_stub_target_tag(): 32-bit always has
* an addr prefix while 64-bit does not
*/
/* FIXME: PR 209709: test perf and remove if outweighs space */
if (*stub == ADDR_PREFIX_OPCODE)
stub_size = STUB_COARSE_INDIRECT_SIZE(FRAG_32_BIT);
else /* default */
# endif
stub_size = STUB_COARSE_INDIRECT_SIZE(0);
/* the stub's jmp is its last instruction */
jmp = stub + stub_size - JMP_LONG_LENGTH;
ASSERT(*jmp == JMP_OPCODE);
/* first hop: where the stub's jmp goes, which must itself be a jmp */
prefix_tgt = (cache_pc) PC_RELATIVE_TARGET(jmp+1);
ASSERT(*prefix_tgt == JMP_OPCODE);
/* second hop: where that jmp goes */
tgt = (cache_pc) PC_RELATIVE_TARGET(prefix_tgt+1);
return tgt;
#elif defined(ARM)
/* FIXME i#1551: NYI on ARM */
ASSERT_NOT_IMPLEMENTED(false);
return NULL;
#endif /* X86/ARM */
}
/* Returns the size of a coarse indirect stub for the unit described by info,
 * selected by that unit's 32-bit flag.
 */
uint
coarse_indirect_stub_size(coarse_info_t *info)
{
    /* Keep in synch w/ exit_stub_size().  We export this separately since
     * it's difficult to get the target to pass to exit_stub_size().
     */
    uint stub_size = STUB_COARSE_INDIRECT_SIZE(COARSE_32_FLAG(info));
    return stub_size;
}
/* Returns whether the entrance stub at stub is currently linked, i.e., its
 * jmp targets neither the trace-head return prefix nor the fcache return
 * prefix.
 * Passing in stub's info avoids a vmvector lookup
 */
bool
entrance_stub_linked(cache_pc stub, coarse_info_t *info /*OPTIONAL*/)
{
    /* entrance stubs are of two types:
     * - targeting trace heads: always point to trace_head_return_coarse,
     *   whether target exists or not, so are always unlinked;
     * - targeting non-trace-heads: if linked, point to fragment; if unlinked,
     *   point to fcache_return_coarse
     */
    cache_pc target = entrance_stub_jmp_target(stub);
    /* FIXME: do vmvector just once instead of for each call */
    if (target == trace_head_return_coarse_prefix(stub, info))
        return false;
    return target != fcache_return_coarse_prefix(stub, info);
}
/* Redirects the jmp of the entrance stub at stub so that it targets tgt.
 *
 * When -persist_protect_stubs is on and the stub's unit has read-only stubs,
 * the page holding the jmp is first made copy-on-writable and, after the
 * patch, made unwritable again.  The exception is when
 * -persist_protect_stubs_limit is positive and the unit's write count exceeds
 * it: then the page is left writable and info->stubs_readonly is cleared.
 *
 * info is optional; it is looked up from stub only if needed.
 * Returns whether it had to change page protections.
 *
 * NOTE(review): hot_patch is never read -- patch_branch() is always passed
 * HOT_PATCHABLE.  Confirm whether callers' hot_patch value is meant to be
 * ignored here.
 */
static bool
patch_coarse_branch(cache_pc stub, cache_pc tgt, bool hot_patch,
coarse_info_t *info /*OPTIONAL*/)
{
/* stubs_readonly: the stub page was read-only on entry (the return value);
 * stubs_restore: re-protect the page after patching
 */
bool stubs_readonly = false;
bool stubs_restore = false;
if (DYNAMO_OPTION(persist_protect_stubs)) {
if (info == NULL)
info = get_stub_coarse_info(stub);
ASSERT(info != NULL);
if (info->stubs_readonly) {
stubs_readonly = true;
stubs_restore = true;
/* if we don't preserve mapped-in COW state the protection change
* will fail (case 10570)
*/
make_copy_on_writable((byte *)PAGE_START(entrance_stub_jmp(stub)),
/* stub jmp can't cross page boundary (can't
* cross cache line in fact) */
PAGE_SIZE);
if (DYNAMO_OPTION(persist_protect_stubs_limit) > 0) {
info->stubs_write_count++;
if (info->stubs_write_count >
DYNAMO_OPTION(persist_protect_stubs_limit)) {
/* too many writes: stop re-protecting this unit's stubs */
SYSLOG_INTERNAL_WARNING_ONCE("pcache stubs over write limit");
STATS_INC(pcache_unprot_over_limit);
stubs_restore = false;
info->stubs_readonly = false;
}
}
}
}
patch_branch(entrance_stub_jmp(stub), tgt, HOT_PATCHABLE);
if (stubs_restore)
make_unwritable((byte *)PAGE_START(entrance_stub_jmp(stub)), PAGE_SIZE);
return stubs_readonly;
}
/* Links the entrance stub at stub by pointing its jmp at tgt.
 * Requires -coarse_units and that the caller holds change_linking_lock.
 * Passing in stub's info avoids a vmvector lookup
 */
void
link_entrance_stub(dcontext_t *dcontext, cache_pc stub, cache_pc tgt,
                   bool hot_patch, coarse_info_t *info /*OPTIONAL*/)
{
    bool changed_protection;
    ASSERT(DYNAMO_OPTION(coarse_units));
    ASSERT(self_owns_recursive_lock(&change_linking_lock));
    LOG(THREAD, LOG_LINKS, 5, "link_entrance_stub "PFX"\n", stub);
    changed_protection = patch_coarse_branch(stub, tgt, hot_patch, info);
    if (changed_protection)
        STATS_INC(pcache_unprot_link);
    /* We check this afterward since this link may be what makes it consistent
     * FIXME: pass in arg to not check target?  Then call before and after */
    ASSERT(coarse_is_entrance_stub(stub));
}
/* Unlinks the entrance stub at stub: its jmp is pointed at the trace-head
 * return prefix when flags has FRAG_IS_TRACE_HEAD or FRAG_IS_TRACE set, and
 * at the fcache return prefix otherwise.
 * Requires -coarse_units and that the caller holds change_linking_lock.
 * Passing in stub's info avoids a vmvector lookup
 */
void
unlink_entrance_stub(dcontext_t *dcontext, cache_pc stub, uint flags,
                     coarse_info_t *info /*OPTIONAL*/)
{
    cache_pc unlinked_tgt;
    ASSERT(DYNAMO_OPTION(coarse_units));
    ASSERT(coarse_is_entrance_stub(stub));
    ASSERT(self_owns_recursive_lock(&change_linking_lock));
    LOG(THREAD, LOG_LINKS, 5,
        "unlink_entrance_stub "PFX"\n", stub);
    unlinked_tgt = TESTANY(FRAG_IS_TRACE_HEAD|FRAG_IS_TRACE, flags) ?
        trace_head_return_coarse_prefix(stub, info) :
        fcache_return_coarse_prefix(stub, info);
    if (patch_coarse_branch(stub, unlinked_tgt, HOT_PATCHABLE, info))
        STATS_INC(pcache_unprot_unlink);
}
/* Returns the pc-relative target of the exit cti at cti, computed from the
 * cti's displacement field.
 */
cache_pc
entrance_stub_from_cti(cache_pc cti)
{
    /* location of the displacement inside the cti */
    cache_pc disp_pc = exit_cti_disp_pc(cti);
    cache_pc stub = (cache_pc) PC_RELATIVE_TARGET(disp_pc);
    return stub;
}
/*******************************************************************************/
/* Patch list support routines */
/* Resets patch to an empty list of the given type. */
void
init_patch_list(patch_list_t *patch, patch_list_type_t type)
{
    /* Cast to int to avoid a tautological comparison warning from clang. */
    ASSERT_TRUNCATE(patch->type, ushort, (int)type);
    patch->type = (ushort) type;
    /* no entries recorded yet */
    patch->num_relocations = 0;
}
/* Appends an entry to patch recording instr, its flags, the offset within the
 * instruction, and the location of the value for future updates.
 * Terminates the process if the list is already full.
 * Use the type checked wrappers add_patch_entry or add_patch_marker
 */
void
add_patch_entry_internal(patch_list_t *patch, instr_t *instr, ushort patch_flags,
                         short instruction_offset,
                         ptr_uint_t value_location_offset)
{
    uint slot = patch->num_relocations;
    ASSERT(patch->num_relocations < MAX_PATCH_ENTRIES);
    /* Since in debug build we have the extra slots for stats, it's important
     * to provide a useful release build message
     */
    if (patch->num_relocations >= MAX_PATCH_ENTRIES) {
        SYSLOG_CUSTOM_NOTIFY(SYSLOG_CRITICAL, MSG_EXCEPTION, 4,
                             "Maximum patch entries exceeded",
                             get_application_name(), get_application_pid(),
                             "<maxpatch>", "Maximum patch entries exceeded");
        os_terminate(get_thread_private_dcontext(), TERMINATE_PROCESS);
        ASSERT_NOT_REACHED();
    }
    LOG(THREAD_GET, LOG_EMIT, 4,
        "add_patch_entry[%d] value_location_offset="PFX"\n", slot,
        value_location_offset);
    /* fill in the next free slot, then publish it by bumping the count */
    patch->entry[slot].instr_offset = instruction_offset;
    patch->entry[slot].value_location_offset = value_location_offset;
    patch->entry[slot].patch_flags = patch_flags;
    patch->entry[slot].where.instr = instr;
    patch->num_relocations++;
}
/* Adds instr to the patch list as a marker, so that its offset can be
 * retrieved later.  Takes an instruction and an offset within the instruction.
 * Result: the offset within the encoded instruction stream will be stored in
 * *target_offset by encode_with_patch_list.
 */
void
add_patch_marker(patch_list_t *patch, instr_t *instr, ushort patch_flags,
                 short instr_offset, ptr_uint_t *target_offset /* OUT */)
{
    ushort marker_flags = (ushort) (patch_flags | PATCH_MARKER);
    /* the output pointer travels in the entry's value_location_offset field */
    add_patch_entry_internal(patch, instr, marker_flags, instr_offset,
                             (ptr_uint_t) target_offset);
}
/* Compacts the patch list in place, dropping every PATCH_MARKER entry since
 * markers are not needed for dynamic updates.  Relative order of the
 * remaining entries is preserved.
 */
static INLINE_ONCE void
remove_assembled_patch_markers(dcontext_t *dcontext, patch_list_t *patch)
{
    ushort kept = 0;
    ushort scan;
    /* we can remove the PATCH_MARKER entries after encoding,
       and so patch_emitted_code won't even need to check for PATCH_MARKER
    */
    for (scan = 0; scan < patch->num_relocations; scan++) {
        if (!TEST(PATCH_MARKER, patch->entry[scan].patch_flags)) {
            patch->entry[kept] = patch->entry[scan];
            kept++;
            continue;
        }
        LOG(THREAD, LOG_EMIT, 4,
            "remove_assembled_patch_markers: removing marker %d\n", scan);
    }
    LOG(THREAD, LOG_EMIT, 3, "remove_assembled_patch_markers: relocations %d, left only %d\n",
        patch->num_relocations, kept);
    patch->num_relocations = kept;
}
/* Indirect all instructions instead of later patching.
 *
 * Walks ilist and, for each instruction that matches the next patch entry
 * (entries are expected in the same order as ilist), rewrites source operand 0
 * of every non-marker entry so that it reads through XDI with the entry's
 * value_location_offset as the displacement:
 *  - a near base+disp operand gets its displacement set and REG_NULL replaced
 *    by XDI;
 *  - an immediate operand is replaced with a 4-byte base+disp through XDI;
 *  - any other operand kind is written back unchanged.
 * Marker entries are skipped but still consume a patch entry.
 */
static void
relocate_patch_list(dcontext_t *dcontext, patch_list_t *patch,
                    instrlist_t *ilist)
{
    instr_t *inst;
    uint cur = 0;
    LOG(THREAD, LOG_EMIT, 3, "relocate_patch_list ["PFX"]\n", patch);
    /* go through the instructions and "relocate" by indirectly using XDI */
    for (inst = instrlist_first(ilist); inst; inst = instr_get_next(inst)) {
        if (cur < patch->num_relocations &&
            inst == patch->entry[cur].where.instr) {
            ASSERT(!TEST(PATCH_OFFSET_VALID, patch->entry[cur].patch_flags));
            if (!TEST(PATCH_MARKER, patch->entry[cur].patch_flags)) {
                opnd_t opnd;
                ASSERT(instr_num_srcs(inst) > 0);
                opnd = instr_get_src(inst, 0);
                DOLOG(4, LOG_EMIT, {
                    /* the format has a %d, so the entry index must be passed */
                    LOG(THREAD, LOG_EMIT, 2,
                        "encode_with_patch_list: patch_entry_t[%d] before update \n",
                        cur);
                    instr_disassemble(dcontext, inst, THREAD);
                    LOG(THREAD, LOG_EMIT, 2, "\n");
                });
                /* we assume that per_thread_t will be in XDI,
                   and the displacement is in value_location_offset */
                IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_int
                              (patch->entry[cur].value_location_offset)));
                if (opnd_is_near_base_disp(opnd)) {
                    /* indirect through XDI and update displacement */
                    opnd_set_disp(&opnd, (int) patch->entry[cur].value_location_offset);
                    opnd_replace_reg(&opnd, REG_NULL, SCRATCH_REG5/*xdi/r5*/);
                } else if (opnd_is_immed_int(opnd)) {
                    /* indirect through XDI and set displacement */
                    /* converting AND $0x00003fff, %xcx -> %xcx
                       into       AND mask(%xdi), %xcx -> %xcx
                    */
                    opnd = opnd_create_base_disp
                        (SCRATCH_REG5/*xdi/r5*/, REG_NULL, 0,
                         (int) patch->entry[cur].value_location_offset, OPSZ_4);
                }
                instr_set_src(inst, 0, opnd);
                DOLOG(3, LOG_EMIT, {
                    /* the format has a %d, so the entry index must be passed */
                    LOG(THREAD, LOG_EMIT, 2,
                        "encode_with_patch_list: patch_entry_t[%d] after update \n",
                        cur);
                    instr_disassemble(dcontext, inst, THREAD);
                    LOG(THREAD, LOG_EMIT, 2, "\n");
                });
            }
            cur++;
        }
    }
}
/* Encodes ilist at start_pc and updates the patch list with the offsets of
 * its entries in the assembled instruction stream.
 * Cf: instrlist_encode which does not support a patch list.
 *
 * For a PATCH_TYPE_INDIRECT_XDI list the instructions are first rewritten by
 * relocate_patch_list().  Each instruction's note field is set to its offset
 * from the start of the list before encoding.  The patch entries are assumed
 * to be in the same order as ilist.  For each entry, where.offset is set to
 * the offset from start_pc of the patched location (a negative instr_offset
 * counts back from the end of the instruction, a non-negative one forward from
 * its beginning) and PATCH_OFFSET_VALID is set.  For PATCH_MARKER entries the
 * computed offset (made absolute if PATCH_ASSEMBLE_ABSOLUTE is set) is also
 * stored through the pointer held in value_location_offset.  Marker entries
 * are removed from the list before returning.
 *
 * Returns length of emitted code.
 */
int
encode_with_patch_list(dcontext_t *dcontext, patch_list_t *patch,
                       instrlist_t *ilist, cache_pc start_pc)
{
    instr_t *inst;
    uint len;
    uint cur;
    cache_pc pc = start_pc;
    ASSERT(patch->num_relocations < MAX_PATCH_ENTRIES);
    if (patch->type == PATCH_TYPE_INDIRECT_XDI) {
        relocate_patch_list(dcontext, patch, ilist);
    }
    /* now encode the instructions */
    /* must set note fields first with offset */
    len = 0;
    for (inst = instrlist_first(ilist); inst; inst = instr_get_next(inst)) {
        instr_set_note(inst, (void *)(ptr_uint_t)len);
        len += instr_length(dcontext, inst);
    }
    cur = 0;
    /* after instruction list is assembled we collect the offsets */
    for (inst = instrlist_first(ilist); inst; inst = instr_get_next(inst)) {
        byte *nxt_pc = instr_encode(dcontext, inst, pc);
        ASSERT(nxt_pc != NULL);
        len = (int) (nxt_pc - pc);
        pc = nxt_pc;
        if (cur < patch->num_relocations &&
            inst == patch->entry[cur].where.instr) {
            /* Only read the entry once cur is known to be in range: reading it
             * unconditionally would touch a stale (or, for a full list,
             * out-of-bounds) slot after the last entry has been consumed.
             */
            short offset_in_instr = patch->entry[cur].instr_offset;
            ASSERT(!TEST(PATCH_OFFSET_VALID, patch->entry[cur].patch_flags));
            /* support positive offsets from beginning and negative - from end of instruction */
            if (offset_in_instr < 0) {
                /* grab offset offset_in_instr bytes from the end of instruction */
                /* most commonly -4 for a 32bit immediate */
                patch->entry[cur].where.offset =
                    ((pc + offset_in_instr) - start_pc);
            } else {
                /* grab offset after skipping offset_in_instr from beginning of instruction */
                patch->entry[cur].where.offset =
                    ((pc - len + offset_in_instr) - start_pc);
            }
            patch->entry[cur].patch_flags |= PATCH_OFFSET_VALID;
            LOG(THREAD, LOG_EMIT, 4,
                "encode_with_patch_list: patch_entry_t[%d] offset="PFX"\n",
                cur, patch->entry[cur].where.offset);
            if (TEST(PATCH_MARKER, patch->entry[cur].patch_flags)) {
                /* treat value_location_offset as an output argument
                   and store there the computed offset,
                */
                ptr_uint_t *output_value = (ptr_uint_t *)
                    patch->entry[cur].value_location_offset;
                ptr_uint_t output_offset = patch->entry[cur].where.offset;
                if (TEST(PATCH_ASSEMBLE_ABSOLUTE, patch->entry[cur].patch_flags)) {
                    ASSERT(!TEST(PATCH_UINT_SIZED, patch->entry[cur].patch_flags));
                    output_offset += (ptr_uint_t)start_pc;
                }
                if (TEST(PATCH_UINT_SIZED, patch->entry[cur].patch_flags)) {
                    IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_uint(output_offset)));
                    *((uint *)output_value) = (uint) output_offset;
                } else
                    *output_value = output_offset;
            }
            LOG(THREAD, LOG_EMIT, 4,
                "encode_with_patch_list [%d] extras patch_flags=0x%x value_offset="
                PFX"\n", cur, patch->entry[cur].patch_flags,
                patch->entry[cur].value_location_offset);
            cur++;
        }
    }
    /* assuming patchlist is in the same order as ilist, we should have seen all */
    LOG(THREAD, LOG_EMIT, 4, "cur %d, num %d", cur, patch->num_relocations);
    ASSERT(cur == patch->num_relocations);
    remove_assembled_patch_markers(dcontext, patch);
    ASSERT(CHECK_TRUNCATE_TYPE_int(pc - start_pc));
    return (int)(pc - start_pc);
}
#ifdef DEBUG
/* Logs the number of entries in patch followed by one line per entry with its
 * offset, flags, and value offset.  Every entry is expected to already have a
 * valid offset.
 */
void
print_patch_list(patch_list_t *patch)
{
    uint idx = 0;
    LOG(THREAD_GET, LOG_EMIT, 4, "patch="PFX" num_relocations=%d\n",
        patch, patch->num_relocations);
    while (idx < patch->num_relocations) {
        ASSERT(TEST(PATCH_OFFSET_VALID, patch->entry[idx].patch_flags));
        LOG(THREAD_GET, LOG_EMIT, 4,
            "patch_list [%d] offset="PFX" patch_flags=%d value_offset="PFX"\n", idx,
            patch->entry[idx].where.offset,
            patch->entry[idx].patch_flags,
            patch->entry[idx].value_location_offset);
        idx++;
    }
}
# ifdef INTERNAL
/* Disassembles the code in [start_pc, end_pc) to the thread log, prefixing a
 * line with the index of the next patch list entry once the disassembly
 * position has reached that entry's offset.
 */
static void
disassemble_with_annotations(dcontext_t *dcontext, patch_list_t *patch,
                             byte *start_pc, byte *end_pc)
{
    byte *cursor = start_pc;
    uint next_entry = 0;
    do {
        if (next_entry >= patch->num_relocations ||
            cursor < start_pc + patch->entry[next_entry].where.offset) {
            LOG(THREAD, LOG_EMIT, 2, " ");
        } else {
            ASSERT(TEST(PATCH_OFFSET_VALID, patch->entry[next_entry].patch_flags));
            /* this is slightly off - we'll mark next instruction,
               but is good enough for this purpose */
            LOG(THREAD, LOG_EMIT, 2, "%d:", next_entry);
            next_entry++;
        }
        cursor = disassemble_with_bytes(dcontext, cursor, THREAD);
    } while (cursor < end_pc);
    LOG(THREAD, LOG_EMIT, 2, "\n");
}
# endif
#endif
/* Updates the emitted code starting at start_pc according to patch.
 *
 * Only PATCH_TYPE_ABSOLUTE lists are patched; any other type is indirected
 * and needs no patching, so the routine returns early.  For each entry the
 * address of the value is computed from the thread's per_thread_t
 * (dcontext->fragment_field): directly for PATCH_PER_THREAD entries, or via
 * a pointer stored in per_thread_t for PATCH_UNPROT_STAT entries.  Either the
 * value stored there or, with PATCH_TAKE_ADDRESS, the address itself is then
 * written at start_pc + where.offset, as a uint if PATCH_UINT_SIZED and as a
 * pointer-sized value otherwise.
 *
 * dcontext must be a real thread dcontext (not NULL or GLOBAL_DCONTEXT).
 */
static void
patch_emitted_code(dcontext_t *dcontext, patch_list_t *patch, byte *start_pc)
{
    uint i;
    /* FIXME: can get this as a patch list entry through indirection */
    per_thread_t *pt;
    /* validate dcontext before it is dereferenced */
    ASSERT(dcontext != GLOBAL_DCONTEXT && dcontext != NULL);
    pt = (per_thread_t *) dcontext->fragment_field;
    /* two PFX conversions, so both start_pc and pt must be passed */
    LOG(THREAD, LOG_EMIT, 2, "patch_emitted_code start_pc="PFX" pt="PFX"\n",
        start_pc, pt);
    if (patch->type != PATCH_TYPE_ABSOLUTE) {
        LOG(THREAD, LOG_EMIT, 2,
            "patch_emitted_code type=%d indirected, nothing to patch\n", patch->type);
        /* FIXME: propagate the check earlier to save the extraneous calls
           to update_indirect_exit_stub and update_indirect_branch_lookup
        */
        return;
    }
    DOLOG(4, LOG_EMIT, {
        print_patch_list(patch);
    });
    for(i=0; i<patch->num_relocations; i++) {
        byte *pc = start_pc + patch->entry[i].where.offset;
        /* value address, (think for example of pt->trace.hash_mask) */
        ptr_uint_t value;
        char *vaddr = NULL;
        if (TEST(PATCH_PER_THREAD, patch->entry[i].patch_flags)) {
            vaddr = (char *)pt + patch->entry[i].value_location_offset;
        } else if (TEST(PATCH_UNPROT_STAT, patch->entry[i].patch_flags)) {
            /* separate the two parts of the stat */
            uint unprot_offs = (uint) (patch->entry[i].value_location_offset) >> 16;
            uint field_offs = (uint) (patch->entry[i].value_location_offset) & 0xffff;
            IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_uint
                          (patch->entry[i].value_location_offset)));
            vaddr = (*((char **)((char *)pt + unprot_offs))) + field_offs;
            LOG(THREAD, LOG_EMIT, 4,
                "patch_emitted_code [%d] value "PFX" => 0x%x 0x%x => "PFX"\n",
                i, patch->entry[i].value_location_offset, unprot_offs, field_offs, vaddr);
        }
        else
            ASSERT_NOT_REACHED();
        ASSERT(TEST(PATCH_OFFSET_VALID, patch->entry[i].patch_flags));
        ASSERT(!TEST(PATCH_MARKER, patch->entry[i].patch_flags));
        if (!TEST(PATCH_TAKE_ADDRESS, patch->entry[i].patch_flags)) {
            /* use value pointed by computed address */
            if (TEST(PATCH_UINT_SIZED, patch->entry[i].patch_flags))
                value = (ptr_uint_t) *((uint *)vaddr);
            else
                value = *(ptr_uint_t*)vaddr;
        } else {
            ASSERT(!TEST(PATCH_UINT_SIZED, patch->entry[i].patch_flags));
            value = (ptr_uint_t)vaddr; /* use computed address */
        }
        LOG(THREAD, LOG_EMIT, 4,
            "patch_emitted_code [%d] offset="PFX" patch_flags=%d value_offset="PFX
            " vaddr="PFX" value="PFX"\n", i,
            patch->entry[i].where.offset, patch->entry[i].patch_flags,
            patch->entry[i].value_location_offset, vaddr, value);
        if (TEST(PATCH_UINT_SIZED, patch->entry[i].patch_flags)) {
            IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_uint(value)));
            *((uint*)pc) = (uint) value;
        } else
            *((ptr_uint_t *)pc) = value;
        LOG(THREAD, LOG_EMIT, 4,
            "patch_emitted_code: updated pc *"PFX" = "PFX"\n", pc, value);
    }
    STATS_INC(emit_patched_fragments);
    DOSTATS({
        /* PR 217008: avoid gcc warning from truncation assert in XSTATS_ADD_DC */
        int tmp_num = patch->num_relocations;
        STATS_ADD(emit_patched_relocations, tmp_num);
    });
    LOG(THREAD, LOG_EMIT, 4, "patch_emitted_code done\n");
}
/* Updates an indirect branch exit stub with the latest hashtable mask
 * and hashtable address
 * See also update_indirect_branch_lookup
 *
 * The stub of exit l in fragment f is patched with the ibl stub patch list of
 * the matching branch type: the trace list if f is a trace, else the bb list.
 * Nothing is done when -disable_traces is set and the bb ibl head for this
 * branch type is not inlined, or (Windows) when the exit targets
 * shared_syscall.
 * f must not be shared: shared fragments use indirection, so the caller is
 * expected to check.
 */
void
update_indirect_exit_stub(dcontext_t *dcontext, fragment_t *f, linkstub_t *l)
{
generated_code_t *code = get_emitted_routines_code
(dcontext _IF_X64(FRAGMENT_GENCODE_MODE(f->flags)));
/* start of the stub code that the patch list offsets are relative to */
# ifdef CUSTOM_EXIT_STUBS
byte *start_pc = (byte *) EXIT_FIXED_STUB_PC(dcontext, f, l);
# else
byte *start_pc = (byte *) EXIT_STUB_PC(dcontext, f, l);
# endif
ibl_branch_type_t branch_type;
ASSERT(linkstub_owned_by_fragment(dcontext, f, l));
ASSERT(LINKSTUB_INDIRECT(l->flags));
ASSERT(EXIT_HAS_STUB(l->flags, f->flags));
/* Shared use indirection so no patching needed -- caller should check */
ASSERT(!TEST(FRAG_SHARED, f->flags));
#ifdef WINDOWS
/* Do not touch shared_syscall */
if (EXIT_TARGET_TAG(dcontext, f, l) ==
shared_syscall_routine_ex(dcontext _IF_X64(FRAGMENT_GENCODE_MODE(f->flags))))
return;
#endif
branch_type = extract_branchtype(l->flags);
LOG(THREAD, LOG_EMIT, 4, "update_indirect_exit_stub: f->tag="PFX"\n",
f->tag);
/* with traces disabled, only an inlined bb ibl head needs patching */
if (DYNAMO_OPTION(disable_traces) && !code->bb_ibl[branch_type].ibl_head_is_inlined) {
return;
}
if (TEST(FRAG_IS_TRACE, f->flags)) {
ASSERT(code->trace_ibl[branch_type].ibl_head_is_inlined);
patch_emitted_code(dcontext, &code->trace_ibl[branch_type].ibl_stub_patch, start_pc);
} else {
ASSERT(code->bb_ibl[branch_type].ibl_head_is_inlined);
patch_emitted_code(dcontext, &code->bb_ibl[branch_type].ibl_stub_patch, start_pc);
}
}
/*###########################################################################
*
* fragment_t Prefixes
*
* Two types: indirect branch target, which restores eflags and xcx, and
* normal prefix, which just restores xcx
*/
/* Indirect Branch Target Prefix
* We have 3 different prefixes: one if we don't need to restore eflags, one
* if we need to restore just using sahf, and one if we also need to restore
* the overflow flag OF.
*
* FIXME: currently we cache-align the prefix, not the normal
* entry point...if prefix gets much longer, might want to add
* nops to get normal entry cache-aligned?
*/
/* for now all ibl targets must use same scratch locations: tls or not, no mixture */
/* Size of the prefix piece that restores xax.  One of three size constants is
 * chosen: SIZE64_MOV_R8_TO_XAX for x86-to-x64 fragments when
 * -x86_to_x64_ibl_opt is on (x64 builds only), else the mov-xax-to-tls size
 * when IBL_EFLAGS_IN_TLS(), else SIZE32_MOV_XAX_TO_ABS.
 */
#define RESTORE_XAX_PREFIX(flags) \
((FRAG_IS_X86_TO_X64(flags) && \
IF_X64_ELSE(DYNAMO_OPTION(x86_to_x64_ibl_opt), false)) ? \
SIZE64_MOV_R8_TO_XAX : \
(IBL_EFLAGS_IN_TLS() ? SIZE_MOV_XAX_TO_TLS(flags, false) : SIZE32_MOV_XAX_TO_ABS))
/* Size of the xax-restore piece plus the base fragment prefix. */
#define PREFIX_BASE(flags) \
(RESTORE_XAX_PREFIX(flags) + FRAGMENT_BASE_PREFIX_SIZE(flags))
/* Returns the size in bytes of the prefix for a fragment with the given flags.
 *
 * Without an ibt prefix the size is 0, or the base prefix size when
 * bb_prefixes is enabled (CLIENT_INTERFACE builds only).  With an ibt prefix
 * the size starts from PREFIX_BASE(flags) and is adjusted for whether xax and
 * eflags have to be restored.
 */
int
fragment_prefix_size(uint flags)
{
    bool use_eflags_restore;
    if (!use_ibt_prefix(flags)) {
#ifdef CLIENT_INTERFACE
        if (dynamo_options.bb_prefixes)
            return FRAGMENT_BASE_PREFIX_SIZE(flags);
#endif
        return 0;
    }
    if (TEST(FRAG_IS_TRACE, flags))
        use_eflags_restore = !DYNAMO_OPTION(trace_single_restore_prefix);
    else
        use_eflags_restore = !DYNAMO_OPTION(bb_single_restore_prefix);
    /* The common case is !INTERNAL_OPTION(unsafe_ignore_eflags*) so
     * PREFIX_BASE(flags) is defined accordingly, and we subtract from it to
     * get the correct value when the option is on.
     */
    if (INTERNAL_OPTION(unsafe_ignore_eflags_prefix)) {
        if (!INTERNAL_OPTION(unsafe_ignore_eflags_ibl)) {
            /* still need to restore xax, just don't restore eflags */
            return PREFIX_BASE(flags);
        }
        ASSERT(PREFIX_BASE(flags) - RESTORE_XAX_PREFIX(flags) >= 0);
        return PREFIX_BASE(flags) - RESTORE_XAX_PREFIX(flags);
    }
    if (!use_eflags_restore)
        return PREFIX_BASE(flags) - RESTORE_XAX_PREFIX(flags);
    /* no flag restoration needed */
    if (TEST(FRAG_WRITES_EFLAGS_6, flags))
        return PREFIX_BASE(flags);
    /* no OF restoration needed */
    if (TEST(FRAG_WRITES_EFLAGS_OF, flags))
        return (PREFIX_BASE(flags) + PREFIX_SIZE_FIVE_EFLAGS);
    /* must restore all 6 flags, unless told to not restore OF */
    if (INTERNAL_OPTION(unsafe_ignore_overflow))
        return (PREFIX_BASE(flags) + PREFIX_SIZE_FIVE_EFLAGS);
    return (PREFIX_BASE(flags) + PREFIX_SIZE_RESTORE_OF +
            PREFIX_SIZE_FIVE_EFLAGS);
}
#ifdef PROFILE_RDTSC
/***************************************************************************
***************************************************************************
** PROFILING USING RDTSC
**
**/
/*
We want the profile code to not count towards fragment times.
So we stop time as quickly as possible, in assembly here instead of
in the profile_fragment_enter function, and start time again as late
as possible:
mov %eax, eax_offset(dcontext) # save eax
mov %edx, edx_offset(dcontext) # save edx
rdtsc # stop time
switch to dynamo stack
pushfl # save eflags (call will clobber)
mov %ecx, ecx_offset(dcontext) # save ecx
pushl %edx # pass time as arg
pushl %eax
pushil &fragment_address # pass &frag as arg
call profile_fragment_enter #
addl $0xc, %esp # clean up args
popl %ecx # restore ecx
popfl # restore eflags
restore app stack
rdtsc # start time
movl %eax, start_time_OFFS(dcontext) # store time value
movl %edx, 4+start_time_OFFS(dcontext) # store time value
mov eax_offset(dcontext), %eax # restore eax
mov edx_offset(dcontext), %edx # restore edx
mov ecx_offset(dcontext), %ecx # restore ecx
*/
/* Total size in bytes of the code in profile_call_buf; 0 until the buffer
 * has been built.
 */
static uint profile_call_length = 0;
/* Offset within the profile call code of the pushed fragment-address
 * immediate, filled in by finalize_profile_call().
 */
static int profile_call_fragment_offset = 0;
/* Offset within the profile call code of the call's pc-relative
 * displacement, filled in by finalize_profile_call().
 */
static int profile_call_call_offset = 0;
/* Encoded profile call code, copied into place by insert_profile_call() */
static byte profile_call_buf[128];
/* The dcontext whose addresses are baked into profile_call_buf */
static dcontext_t *buffer_dcontext;
static void build_profile_call_buffer(void);
/* Returns the size in bytes of the profile call code, building the code
 * buffer first if that has not happened yet.
 */
uint
profile_call_size()
{
    if (profile_call_length != 0)
        return profile_call_length;
    /* first use: build the buffer, which sets profile_call_length */
    build_profile_call_buffer();
    return profile_call_length;
}
/* if insert_profile_call emits its code into the trace buffer, this
* routine must be called once the fragment is created and the code is
* in the fcache
*
* Fixes up the copy of the profile call code at f's fcache entry point:
*  - stores f's address into the pushed fragment-address immediate,
*  - stores the pc-relative displacement of the call to
*    profile_fragment_enter,
*  - re-encodes in place every mov load/store with an absolute address operand
*    after passing it through update_dcontext_address() to retarget it from
*    buffer_dcontext to dcontext.
* Not implemented on x64 (PR 248210).
*/
void
finalize_profile_call(dcontext_t *dcontext, fragment_t *f)
{
byte *start_pc = (byte *) FCACHE_ENTRY_PC(f);
byte *pc;
byte *prev_pc;
instr_t instr;
instr_init(dcontext, &instr);
/* fill in address of owning fragment now that that fragment exists */
pc = start_pc + profile_call_fragment_offset;
/* PR 248210: unsupported feature on x64 */
IF_X64(ASSERT_NOT_IMPLEMENTED(false));
*((int *)pc) = (uint)f;
/* fill in call's proper pc-relative offset now that code is
* in its final location in fcache
*/
pc = start_pc + profile_call_call_offset;
IF_X64(ASSERT_NOT_IMPLEMENTED(false));
/* displacement is relative to the end of the 4-byte field */
*((int *)pc) = (int)&profile_fragment_enter - (int)pc - 4;
/* must fix up all dcontext references to point to the right dcontext */
pc = start_pc;
do {
/* remember where this instr starts so it can be re-encoded in place */
prev_pc = pc;
instr_reset(dcontext, &instr);
pc = decode(dcontext, pc, &instr);
ASSERT(instr_valid(&instr)); /* our own code! */
/* look for loads and stores that reference buffer_dcontext */
if (instr_get_opcode(&instr) == OP_mov_ld &&
opnd_is_near_base_disp(instr_get_src(&instr, 0)) &&
opnd_get_base(instr_get_src(&instr, 0)) == REG_NULL &&
opnd_get_index(instr_get_src(&instr, 0)) == REG_NULL) {
/* if not really dcontext value, update_ will return old value */
instr_set_src(&instr, 0,
update_dcontext_address(instr_get_src(&instr, 0),
buffer_dcontext, dcontext));
}
else if (instr_get_opcode(&instr) == OP_mov_st &&
opnd_is_near_base_disp(instr_get_dst(&instr, 0)) &&
opnd_get_base(instr_get_dst(&instr, 0)) == REG_NULL &&
opnd_get_index(instr_get_dst(&instr, 0)) == REG_NULL) {
/* if not really dcontext value, update_ will return old value */
instr_set_dst(&instr, 0,
update_dcontext_address(instr_get_dst(&instr, 0),
buffer_dcontext, dcontext));
}
/* re-encode over the original bytes only if the raw bits are no longer valid */
if (!instr_raw_bits_valid(&instr)) {
DEBUG_DECLARE(byte *nxt_pc;)
DEBUG_DECLARE(nxt_pc = ) instr_encode(dcontext, &instr, prev_pc);
ASSERT(nxt_pc != NULL);
}
} while (pc < start_pc + profile_call_length);
instr_free(dcontext, &instr);
}
/* Copies the pre-built profile call code to start_pc, building the code
 * buffer first if that has not happened yet.
 */
void
insert_profile_call(cache_pc start_pc)
{
    if (profile_call_length == 0) {
        build_profile_call_buffer();
    }
    /* if thread-private, we change to proper dcontext when finalizing */
    memcpy((void *)start_pc, profile_call_buf, profile_call_length);
}
/* This routine builds the profile call code using the instr_t
* abstraction, then emits it into a buffer to be saved.
* The code can then be directly copied whenever needed.
* Assumption: this thread's dcontext must have been created
* before calling this function.
*
* Side effects: fills profile_call_buf and sets profile_call_length,
* profile_call_fragment_offset, profile_call_call_offset, and
* buffer_dcontext.
* NOTE(review): the 128-byte bound on profile_call_buf is only asserted after
* each instruction has already been encoded into the buffer.
*/
static void
build_profile_call_buffer()
{
byte *pc, *nxt_pc;
instrlist_t ilist;
instr_t *inst;
int start_time_offs;
dcontext_t *dcontext = get_thread_private_dcontext();
ASSERT(dcontext != NULL);
/* remember dcontext for easy replacement when finalizing: */
buffer_dcontext = dcontext;
/* we require a dcontext to find this offset because it may
* or may not be pushed to a quadword boundary, making it
* hard to hardcode it
*/
start_time_offs = (int)(&(dcontext->start_time)) - (int)dcontext;
/* initialize the ilist */
instrlist_init(&ilist);
APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_EAX, SCRATCH_REG0_OFFS));
APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_EDX, SCRATCH_REG3_OFFS));
/* get time = rdtsc */
APP(&ilist, INSTR_CREATE_rdtsc(dcontext));
/* swap to dstack */
APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_ESP, XSP_OFFSET));
APP(&ilist, instr_create_restore_dynamo_stack(dcontext));
/* finish saving caller-saved registers
* The profile_fragment_enter function will save the callee-saved
* regs (ebx, ebp, esi, edi) and will restore ebp and esp, but we need
* to explicitly save eax, ecx, and edx
*/
APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_ECX, SCRATCH_REG2_OFFS));
/* save eflags (call will clobber) */
APP(&ilist, INSTR_CREATE_RAW_pushf(dcontext));
#ifdef WINDOWS
/* must preserve the LastErrorCode (if the profile procedure
* calls a Win32 API routine it could overwrite the app's error code)
* currently this is done in the profile routine itself --
* if you want to move it here, look at the code in profile.c
*/
#endif
/* push time as 2nd argument for call */
APP(&ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_EDX)));
APP(&ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_EAX)));
/* push fragment address as 1st argument for call
* fragment isn't built yet, we fill it in in finalize_profile_call
*/
APP(&ilist, INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INT32(0)));
/* call near rel: 4-byte pc-relative offset from start of next instr
* we don't have that offset now so we fill it in later (in
* finalize_profile_call)
*/
APP(&ilist, INSTR_CREATE_call(dcontext, opnd_create_pc(NULL)));
/* pop arguments: addl $0xc, %esp */
APP(&ilist,
INSTR_CREATE_add(dcontext, opnd_create_reg(REG_ESP), OPND_CREATE_INT8(0xc)));
/* restore eflags */
APP(&ilist, INSTR_CREATE_RAW_popf(dcontext));
/* restore caller-saved registers */
APP(&ilist, instr_create_restore_from_dcontext(dcontext, REG_ECX, SCRATCH_REG2_OFFS));
/* restore app stack */
APP(&ilist, instr_create_restore_from_dcontext(dcontext, REG_ESP, XSP_OFFSET));
/* get start time = rdtsc */
APP(&ilist, INSTR_CREATE_rdtsc(dcontext));
/* copy start time into dcontext */
APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_EAX, start_time_offs));
APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_EDX, start_time_offs+4));
/* finish restoring caller-saved registers */
APP(&ilist, instr_create_restore_from_dcontext(dcontext, REG_EDX, SCRATCH_REG3_OFFS));
APP(&ilist, instr_create_restore_from_dcontext(dcontext, REG_EAX, SCRATCH_REG0_OFFS));
/* now encode the instructions */
pc = profile_call_buf;
for (inst = instrlist_first(&ilist); inst; inst = instr_get_next(inst)) {
/* pc is where this instruction is about to be encoded */
if (instr_is_call_direct(inst)) {
/* push_immed was just before us, so fragment address
* starts 4 bytes before us:
*/
profile_call_fragment_offset = (int) (pc - 4 - profile_call_buf);
/* call opcode is 1 byte, offset is next: */
profile_call_call_offset = (int) (pc + 1 - profile_call_buf);
}
/* we have no jumps with instr_t targets so we don't need to set note
* field in order to use instr_encode
*/
nxt_pc = instr_encode(dcontext, inst, (void*)pc);
ASSERT(nxt_pc != NULL);
/* accumulate the total encoded size */
profile_call_length += nxt_pc - pc;
pc = nxt_pc;
ASSERT(profile_call_length < 128);
}
/* free the instrlist_t elements */
instrlist_clear(dcontext, &ilist);
}
#endif /* PROFILE_RDTSC */
#ifdef WINDOWS
# ifdef CLIENT_INTERFACE
/* Leaving in place old notes on LastError preservation: */
/* inlined versions of save/restore last error by reading of TIB */
/* If our inlined version fails on a later version of windows
should verify [GS]etLastError matches the disassembly below.
*/
/* Win2000: kernel32!SetLastError: */
/* 77E87671: 55 push ebp */
/* 77E87672: 8B EC mov ebp,esp */
/* 77E87674: 64 A1 18 00 00 00 mov eax,fs:[00000018] */
/* 77E8767A: 8B 4D 08 mov ecx,dword ptr [ebp+8] */
/* 77E8767D: 89 48 34 mov dword ptr [eax+34h],ecx */
/* 77E87680: 5D pop ebp */
/* 77E87681: C2 04 00 ret 4 */
/* Win2003: ntdll!RtlSetLastWin32Error: optimized to */
/* 77F45BB4: 64 A1 18 00 00 00 mov eax,fs:[00000018] */
/* 77F45BBA: 8B 4C 24 04 mov ecx,dword ptr [esp+4] */
/* 77F45BBE: 89 48 34 mov dword ptr [eax+34h],ecx */
/* 77F45BC1: C2 04 00 ret 4 */
/* See InsideWin2k, p. 329 SelfAddr fs:[18h] simply has the linear address of the TIB
while we're interested only in LastError which is at fs:[34h] */
/* Therefore all we need is a single instruction! */
/* 64 a1 34 00 00 00 mov dword ptr fs:[34h],errno_register */
/* Overall savings: 7 instructions, 5 data words */
/*kernel32!GetLastError:*/
/* 77E87684: 64 A1 18 00 00 00 mov eax,fs:[00000018] */
/* 77E8768A: 8B 40 34 mov eax,dword ptr [eax+34h] */
/* 77E8768D: C3 ret */
/* All we need is a single instruction: */
/* 77F45BBE: 89 48 34 mov reg_result, dword ptr fs:[34h] */
/* i#249: isolate app's PEB+TEB by keeping our own copy and swapping on cxt switch.
 * For clean calls we share this in clean_call_{save,restore} (i#171, i#1349).
 *
 * Inserts, before "next" in "ilist", the instructions that switch the TEB
 * fields handled here: ProcessEnvironmentBlock, LastErrorValue, StackLimit
 * (X64 only), NlsCache, FlsData and ReservedForNtRpc.
 *
 * absolute, reg_dr: forwarded to SAVE_TO_DC_VIA_REG/RESTORE_FROM_DC_VIA_REG to
 *                   address the dcontext fields; the callers in this file pass
 *                   the register holding the dcontext as reg_dr.
 * reg_scratch:      clobbered by the inserted code; must not be REG_NULL.
 * to_priv:          true installs the private values (and saves the app's);
 *                   false puts the app's values back.
 */
void
preinsert_swap_peb(dcontext_t *dcontext, instrlist_t *ilist, instr_t *next,
                   bool absolute, reg_id_t reg_dr, reg_id_t reg_scratch, bool to_priv)
{
    /* Routine-local shorthands, undefined again at the bottom.  They only
     * abbreviate the operand and instruction forms repeated below and rely on
     * this routine's parameters being in scope.
     */
    /* far memory operand for the TEB field at tib_offs */
#define SWAP_PEB_TEB_FIELD(tib_offs, sz) \
    opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL, 0, tib_offs, sz)
    /* TEB field -> reg */
#define SWAP_PEB_TEB_TO_REG(reg, tib_offs, sz)                                  \
    PRE(ilist, next, XINST_CREATE_load                                          \
        (dcontext, opnd_create_reg(reg), SWAP_PEB_TEB_FIELD(tib_offs, sz)))
    /* reg -> TEB field */
#define SWAP_PEB_REG_TO_TEB(reg, tib_offs, sz)                                  \
    PRE(ilist, next, XINST_CREATE_store                                         \
        (dcontext, SWAP_PEB_TEB_FIELD(tib_offs, sz), opnd_create_reg(reg)))
    /* reg -> dcontext field */
#define SWAP_PEB_REG_TO_DC(reg, dc_offs)                                        \
    PRE(ilist, next, SAVE_TO_DC_VIA_REG                                         \
        (absolute, dcontext, reg_dr, reg, dc_offs))
    /* dcontext field -> reg */
#define SWAP_PEB_DC_TO_REG(reg, dc_offs)                                        \
    PRE(ilist, next, RESTORE_FROM_DC_VIA_REG                                    \
        (absolute, dcontext, reg_dr, reg, dc_offs))
    /* Pointer-sized TEB field for which both an app and a private value are
     * kept in the dcontext: stash the outgoing side's current value and
     * install the incoming side's.
     */
#define SWAP_PEB_EXCHANGE_TEB_PTR(tib_offs, app_offs, priv_offs) do {           \
        SWAP_PEB_TEB_TO_REG(reg_scratch, tib_offs, OPSZ_PTR);                   \
        SWAP_PEB_REG_TO_DC(reg_scratch, to_priv ? app_offs : priv_offs);        \
        SWAP_PEB_DC_TO_REG(reg_scratch, to_priv ? priv_offs : app_offs);        \
        SWAP_PEB_REG_TO_TEB(reg_scratch, tib_offs, OPSZ_PTR);                   \
    } while (0)

    /* We assume the PEB is globally constant, so no per-thread pointers are
     * needed and we can use absolute pointers known at init time.
     */
    PEB *incoming_peb = to_priv ? get_private_peb() : get_own_peb();
    /* the error value is 32 bits wide even on x64 */
    reg_id_t errno_reg = IF_X64_ELSE(reg_64_to_32(reg_scratch), reg_scratch);
    ASSERT(INTERNAL_OPTION(private_peb) && should_swap_peb_pointer());
    ASSERT(reg_dr != REG_NULL && reg_scratch != REG_NULL);

    /* TEB->ProcessEnvironmentBlock.
     * We can't store a 64-bit immed, so we go through the scratch reg; we do
     * so for 32-bit too since a long 32-bit-immed-store instr to fs:offs is
     * slow to decode.
     */
    PRE(ilist, next, INSTR_CREATE_mov_imm
        (dcontext, opnd_create_reg(reg_scratch),
         OPND_CREATE_INTPTR((ptr_int_t)incoming_peb)));
    SWAP_PEB_REG_TO_TEB(reg_scratch, PEB_TIB_OFFSET, OPSZ_PTR);

    /* TEB->LastErrorValue: only the app's value is preserved.  We used to do
     * this separately b/c DR at one point long ago made some win32 API calls:
     * now we only have to do this when loading private libraries.  We assume
     * no private library code needs to preserve LastErrorCode across app
     * execution.
     */
    if (!to_priv) {
        SWAP_PEB_DC_TO_REG(errno_reg, APP_ERRNO_OFFSET);
        SWAP_PEB_REG_TO_TEB(errno_reg, ERRNO_TIB_OFFSET, OPSZ_4);
    } else {
        SWAP_PEB_TEB_TO_REG(errno_reg, ERRNO_TIB_OFFSET, OPSZ_4);
        SWAP_PEB_REG_TO_DC(errno_reg, APP_ERRNO_OFFSET);
    }

#ifdef X64
    /* We have to swap TEB->StackLimit (i#1102).  For now I'm only doing this
     * on X64, though it seems possible for 32-bit stacks to be up high too?
     * We have never seen that.
     */
    if (!to_priv) {
        /* put the saved app value back */
        SWAP_PEB_DC_TO_REG(reg_scratch, APP_STACK_LIMIT_OFFSET);
        SWAP_PEB_REG_TO_TEB(reg_scratch, BASE_STACK_TIB_OFFSET, OPSZ_PTR);
    } else {
        /* save the app value, then install the value of the dcontext field at
         * DSTACK_OFFSET minus DYNAMORIO_STACK_SIZE
         */
        SWAP_PEB_TEB_TO_REG(reg_scratch, BASE_STACK_TIB_OFFSET, OPSZ_PTR);
        SWAP_PEB_REG_TO_DC(reg_scratch, APP_STACK_LIMIT_OFFSET);
        SWAP_PEB_DC_TO_REG(reg_scratch, DSTACK_OFFSET);
        PRE(ilist, next, INSTR_CREATE_lea
            (dcontext, opnd_create_reg(reg_scratch),
             opnd_create_base_disp(reg_scratch, REG_NULL, 0,
                                   -(int)DYNAMORIO_STACK_SIZE, OPSZ_lea)));
        SWAP_PEB_REG_TO_TEB(reg_scratch, BASE_STACK_TIB_OFFSET, OPSZ_PTR);
    }
#endif

    /* We also swap TEB->NlsCache.  Unlike TEB->ProcessEnvironmentBlock, which
     * is constant, and TEB->LastErrorCode, which is not persistent, we have to
     * maintain both values and swap between them, which is expensive.
     */
    SWAP_PEB_EXCHANGE_TEB_PTR(NLS_CACHE_TIB_OFFSET,
                              APP_NLS_CACHE_OFFSET, PRIV_NLS_CACHE_OFFSET);

    /* We also swap TEB->FlsData, maintaining both values in the same way. */
    SWAP_PEB_EXCHANGE_TEB_PTR(FLS_DATA_TIB_OFFSET,
                              APP_FLS_OFFSET, PRIV_FLS_OFFSET);

    /* We swap TEB->ReservedForNtRpc as well.  Hopefully there won't be many
     * more we'll have to swap.
     */
    SWAP_PEB_EXCHANGE_TEB_PTR(NT_RPC_TIB_OFFSET,
                              APP_RPC_OFFSET, PRIV_RPC_OFFSET);

#undef SWAP_PEB_EXCHANGE_TEB_PTR
#undef SWAP_PEB_DC_TO_REG
#undef SWAP_PEB_REG_TO_DC
#undef SWAP_PEB_REG_TO_TEB
#undef SWAP_PEB_TEB_TO_REG
#undef SWAP_PEB_TEB_FIELD
}
# endif /* CLIENT_INTERFACE */
#endif /* WINDOWS */
/***************************************************************************/
/* THREAD-PRIVATE/SHARED ROUTINE GENERATION */
/***************************************************************************/
/* Export this in instr.h if it becomes useful elsewhere */
/* OPND_ARG1 is the operand from which a generated routine reads its first
 * argument on entry (emit_fcache_enter_common uses it to fetch the dcontext
 * parameter):
 *   - 64-bit x86, WINDOWS: register REG_RCX
 *   - 64-bit x86, other:   register REG_RDI
 *   - 32-bit x86:          32-bit memory at REG_ESP + 4
 *   - ARM:                 register DR_REG_R0
 * It is left undefined when neither X86 nor ARM is defined.
 */
#ifdef X86
# ifdef X64
# ifdef WINDOWS
# define OPND_ARG1 opnd_create_reg(REG_RCX)
# else
# define OPND_ARG1 opnd_create_reg(REG_RDI)
# endif /* Win/Unix */
# else
# define OPND_ARG1 OPND_CREATE_MEM32(REG_ESP, 4)
# endif /* 64/32-bit */
#elif defined(ARM)
# define OPND_ARG1 opnd_create_reg(DR_REG_R0)
#endif /* X86/ARM */
/* register for holding dcontext on fcache enter/return
 * (an alias for SCRATCH_REG5, which the comments in this file refer to as xdi/r5)
 */
#define REG_DCTXT SCRATCH_REG5
/* Appends the instructions that stage the fcache target somewhere the final
 * jump can reach it with an absolute reference.  Nothing is appended when
 * "absolute" is set.  Generated code:
 *
 *  if (!absolute)
 *    # put target somewhere we can be absolute about
 *    RESTORE_FROM_UPCONTEXT next_tag_OFFSET,%xax
 *    if (shared)
 *      mov  %xax,fs:xax_OFFSET
 *    else
 *      # WINDOWS only (not implemented elsewhere)
 *      save %xax into nonswapped_scratch_OFFSET of the main dcontext
 *    endif
 *  endif
 */
static void
append_setup_fcache_target(dcontext_t *dcontext, instrlist_t *ilist,
                           bool absolute, bool shared)
{
    if (!absolute) {
        /* fetch the target into SCRATCH_REG0 so that it can be copied into a
         * special slot that we can be absolute about
         */
        APP(ilist, RESTORE_FROM_DC(dcontext, SCRATCH_REG0, NEXT_TAG_OFFSET));
        if (!shared) {
#ifdef WINDOWS
            /* absolute into main dcontext (not one in REG_DCTXT) */
            APP(ilist, instr_create_save_to_dcontext(dcontext, SCRATCH_REG0,
                                                     NONSWAPPED_SCRATCH_OFFSET));
#else
            /* no special scratch slot! */
            ASSERT_NOT_IMPLEMENTED(false);
#endif /* WINDOWS/!WINDOWS */
        } else {
            APP(ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG0, FCACHE_ENTER_TARGET_SLOT));
        }
    }
}
/* append instructions to jump to target in code cache
* ifdef X64 and (target is x86 mode)
* # we can't indirect through a register since we couldn't restore
* # the high bits (PR 283152)
* mov gencode-jmp86-value, fs:xbx_OFFSET
* far jmp to next instr, stored w/ 32-bit cs selector in fs:xbx_OFFSET
* endif
*
* # jump indirect through dcontext->next_tag, set by dispatch()
* if (absolute)
* JUMP_VIA_DCONTEXT next_tag_OFFSET
* else
* if (shared)
* jmp *fs:xax_OFFSET
* else
* JUMP_VIA_DCONTEXT nonswapped_scratch_OFFSET
* endif
* endif
*/
static void
append_jmp_to_fcache_target(dcontext_t *dcontext, instrlist_t *ilist,
                            generated_code_t *code,
                            bool absolute, bool shared, patch_list_t *patch
                            _IF_X64(byte **jmp86_store_addr)
                            _IF_X64(byte **jmp86_target_addr))
{
#ifdef X86_64
    if (GENCODE_IS_X86(code->gencode_mode)) {
        /* We must use an indirect jmp (far direct are illegal in x64) and
         * we can't indirect through a register since we couldn't restore the
         * high bits (PR 283152) so we write the 6-byte far address to TLS:
         * a 4-byte offset (AMD only supports 32-bit address for far jmp)
         * followed by the 2-byte selector.
         */
        instr_t *after_far_jmp = INSTR_CREATE_label(dcontext);
        instr_t *offset_store =
            XINST_CREATE_store(dcontext,
                               OPND_TLS_FIELD_SZ(TLS_SLOT_REG1, OPSZ_4),
                               OPND_CREATE_INT32(0/*placeholder*/));
        APP(ilist, offset_store);
        APP(ilist, XINST_CREATE_store(dcontext,
                                      OPND_TLS_FIELD_SZ(TLS_SLOT_REG1+4, OPSZ_2),
                                      OPND_CREATE_INT16((ushort)CS32_SELECTOR)));
        APP(ilist, INSTR_CREATE_jmp_far_ind(dcontext,
                                            OPND_TLS_FIELD_SZ(TLS_SLOT_REG1, OPSZ_6)));
        APP(ilist, after_far_jmp);
        /* We need a patch that involves two instrs, which is not supported,
         * so we only record both addresses here: the caller reads them back
         * through the out parameters and does the patch by hand after
         * emitting.
         */
        add_patch_marker(patch, offset_store, PATCH_ASSEMBLE_ABSOLUTE,
                         -4 /* 4 bytes from end */, (ptr_uint_t*)jmp86_store_addr);
        add_patch_marker(patch, after_far_jmp, PATCH_ASSEMBLE_ABSOLUTE,
                         0 /* start of label */, (ptr_uint_t*)jmp86_target_addr);
    }
#endif /* X86_64 */

    /* Jump indirect through next_tag.  Dispatch set this value with
     * where we want to go next in the fcache_t.
     */
    if (absolute) {
        APP(ilist, instr_create_jump_via_dcontext(dcontext, NEXT_TAG_OFFSET));
    } else if (shared) {
        /* next_tag was placed into this tls slot earlier in the routine */
        APP(ilist,
            XINST_CREATE_jump_mem(dcontext,
                                  OPND_TLS_FIELD(FCACHE_ENTER_TARGET_SLOT)));
    } else {
#ifdef WINDOWS
        /* FIXME: we could just use tls, right?  no real need for the "shared"
         * parameter?
         */
        /* need one absolute ref using main dcontext (not one in edi):
         * it's the final jmp, using the special slot we set up earlier
         */
        APP(ilist, instr_create_jump_via_dcontext(dcontext,
                                                  NONSWAPPED_SCRATCH_OFFSET));
#else /* !WINDOWS */
        /* no special scratch slot! */
        ASSERT_NOT_IMPLEMENTED(false);
#endif /* !WINDOWS */
    }
}
/* Our context switch to and from the fragment cache are arranged such
* that there is no persistent state kept on the dstack, allowing us to
* start with a clean slate on exiting the cache. This eliminates the
* need to protect our dstack from inadvertent or malicious writes.
*
* We do not bother to save any DynamoRIO state, even the eflags. We clear
* them in fcache_return, assuming that a cleared state is always the
* proper value (df is never set across the cache, etc.)
*
* The code is split into several helper functions.
*
* # Used by dispatch to begin execution in fcache at dcontext->next_tag
* fcache_enter(dcontext_t *dcontext)
*
* if (!absolute)
* mov ARG1, SCRATCH_REG5 # dcontext param
* if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
* RESTORE_FROM_UPCONTEXT PROT_OFFSET, %xsi
* endif
* endif
*
* # append_setup_fcache_target
* if (!absolute)
* # put target somewhere we can be absolute about
* RESTORE_FROM_UPCONTEXT next_tag_OFFSET, SCRATCH_REG0
* if (shared)
* mov SCRATCH_REG0, fs:xax_OFFSET
* endif
* endif
*
* # append_call_exit_dr_hook
* if (EXIT_DR_HOOK != NULL && !dcontext->ignore_enterexit)
* if (!absolute)
* push %xdi
* push %xsi
* else
* # support for skipping the hook
* RESTORE_FROM_UPCONTEXT ignore_enterexit_OFFSET,%edi
* cmpl %edi,0
* jnz post_hook
* endif
* call EXIT_DR_HOOK # for x64 windows, reserve 32 bytes stack space for call
* if (!absolute)
* pop %xsi
* pop %xdi
* endif
* endif
*
* post_hook:
*
* # restore the original register state
*
* # append_restore_xflags
* RESTORE_FROM_UPCONTEXT xflags_OFFSET,%xax
* push %xax
* popf # restore eflags temporarily using dstack
*
* # append_restore_simd_reg
* if preserve_xmm_caller_saved
* RESTORE_FROM_UPCONTEXT xmm_OFFSET+0*16,%xmm0
* RESTORE_FROM_UPCONTEXT xmm_OFFSET+1*16,%xmm1
* RESTORE_FROM_UPCONTEXT xmm_OFFSET+2*16,%xmm2
* RESTORE_FROM_UPCONTEXT xmm_OFFSET+3*16,%xmm3
* RESTORE_FROM_UPCONTEXT xmm_OFFSET+4*16,%xmm4
* RESTORE_FROM_UPCONTEXT xmm_OFFSET+5*16,%xmm5
* RESTORE_FROM_UPCONTEXT xmm_OFFSET+6*16,%xmm6 # 32-bit Linux
* RESTORE_FROM_UPCONTEXT xmm_OFFSET+7*16,%xmm7 # 32-bit Linux
* endif
*
* # append_restore_gpr
* ifdef X64
* RESTORE_FROM_UPCONTEXT r8_OFFSET,%r8
* RESTORE_FROM_UPCONTEXT r9_OFFSET,%r9
* RESTORE_FROM_UPCONTEXT r10_OFFSET,%r10
* RESTORE_FROM_UPCONTEXT r11_OFFSET,%r11
* RESTORE_FROM_UPCONTEXT r12_OFFSET,%r12
* RESTORE_FROM_UPCONTEXT r13_OFFSET,%r13
* RESTORE_FROM_UPCONTEXT r14_OFFSET,%r14
* RESTORE_FROM_UPCONTEXT r15_OFFSET,%r15
* endif
* RESTORE_FROM_UPCONTEXT xax_OFFSET,%xax
* RESTORE_FROM_UPCONTEXT xbx_OFFSET,%xbx
* RESTORE_FROM_UPCONTEXT xcx_OFFSET,%xcx
* RESTORE_FROM_UPCONTEXT xdx_OFFSET,%xdx
* if (absolute || !TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
* RESTORE_FROM_UPCONTEXT xsi_OFFSET,%xsi
* endif
* if (absolute || TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
* RESTORE_FROM_UPCONTEXT xdi_OFFSET,%xdi
* endif
* RESTORE_FROM_UPCONTEXT xbp_OFFSET,%xbp
* RESTORE_FROM_UPCONTEXT xsp_OFFSET,%xsp
* if (!absolute)
* if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
* RESTORE_FROM_UPCONTEXT xsi_OFFSET,%xsi
* else
* RESTORE_FROM_UPCONTEXT xdi_OFFSET,%xdi
* endif
* endif
*
* # append_jmp_to_fcache_target
* ifdef X64 and (target is x86 mode)
* # we can't indirect through a register since we couldn't restore
* # the high bits (PR 283152)
* mov gencode-jmp86-value, fs:xbx_OFFSET
* far jmp to next instr, stored w/ 32-bit cs selector in fs:xbx_OFFSET
* endif
*
* # jump indirect through dcontext->next_tag, set by dispatch()
* if (absolute)
* JUMP_VIA_DCONTEXT next_tag_OFFSET
* else
* if (shared)
* jmp *fs:xax_OFFSET
* else
* JUMP_VIA_DCONTEXT nonswapped_scratch_OFFSET
* endif
* endif
*
* # now executing in fcache
*/
/* Generates the fcache_enter routine at pc and returns the address just past
 * the emitted code.  The emitted sequence is the one laid out in the comment
 * above; "absolute" and "shared" select the variant.
 */
static byte *
emit_fcache_enter_common(dcontext_t *dcontext, generated_code_t *code,
                         byte *pc, bool absolute, bool shared)
{
    instrlist_t ilist;
    patch_list_t patch;
    int encoded_len;
#ifdef X64
    /* filled in during encoding via the patch markers that
     * append_jmp_to_fcache_target registers for x86-mode gencode
     */
    byte *jmp86_store_addr = NULL;
    byte *jmp86_target_addr = NULL;
#endif /* X64 */

    init_patch_list(&patch, absolute ? PATCH_TYPE_ABSOLUTE : PATCH_TYPE_INDIRECT_XDI);
    instrlist_init(&ilist);

    /* no support for absolute addresses on x64/ARM: we always use tls */
    IF_X64(ASSERT_NOT_IMPLEMENTED(!absolute && shared));
    IF_ARM(ASSERT_NOT_IMPLEMENTED(!absolute && shared));

    if (!absolute) {
        /* the dcontext arrives as the gen routine's parameter: move it into
         * SCRATCH_REG5 (xdi/r5)
         */
        APP(&ilist,
            IF_X86_ELSE(XINST_CREATE_load, XINST_CREATE_move)
            (dcontext, opnd_create_reg(SCRATCH_REG5), OPND_ARG1));
        if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) {
            IF_X86_ELSE({
                APP(&ilist, RESTORE_FROM_DC(dcontext, SCRATCH_REG4, PROT_OFFS));
            }, {
                /* FIXME i#1551: SELFPROT is not supported on ARM */
                ASSERT_NOT_REACHED();
            });
        }
    }

    append_setup_fcache_target(dcontext, &ilist, absolute, shared);
    append_call_exit_dr_hook(dcontext, &ilist, absolute, shared);

#if defined(WINDOWS) && defined(CLIENT_INTERFACE)
    /* i#249: isolate the PEB: switch back to the app's values */
    if (INTERNAL_OPTION(private_peb) && should_swap_peb_pointer()) {
        preinsert_swap_peb(dcontext, &ilist, NULL, absolute, SCRATCH_REG5,
                           SCRATCH_REG0/*scratch*/, false/*to app*/);
    }
#endif

    /* restore the original register state, then leave for the fcache */
    append_restore_xflags(dcontext, &ilist, absolute);
    append_restore_simd_reg(dcontext, &ilist, absolute);
    append_restore_gpr(dcontext, &ilist, absolute);
    append_jmp_to_fcache_target(dcontext, &ilist, code, absolute, shared, &patch
                                _IF_X64(&jmp86_store_addr)
                                _IF_X64(&jmp86_target_addr));

    /* now encode the instructions */
    encoded_len = encode_with_patch_list(dcontext, &patch, &ilist, pc);
    ASSERT(encoded_len != 0);

#ifdef X64
    if (GENCODE_IS_X86(code->gencode_mode)) {
        /* Hand-apply the two-instr patch: write the 32-bit address of the
         * post-jmp label into the placeholder immediate of the store.
         */
        ASSERT(jmp86_target_addr != NULL && jmp86_store_addr != NULL);
        ASSERT(CHECK_TRUNCATE_TYPE_uint((ptr_uint_t)jmp86_target_addr));
        *((uint *)jmp86_store_addr) = (uint)(ptr_uint_t)jmp86_target_addr;
    }
#endif

    /* free the instrlist_t elements */
    instrlist_clear(dcontext, &ilist);

    return pc + encoded_len;
}
/* Emits the fcache_enter variant that addresses the dcontext absolutely and
 * is not shared.  Returns the pc just past the generated code.
 */
byte *
emit_fcache_enter(dcontext_t *dcontext, generated_code_t *code, byte *pc)
{
    const bool absolute = true;
    const bool shared = false;
    return emit_fcache_enter_common(dcontext, code, pc, absolute, shared);
}
/* Generate a shared prologue for grabbing the dcontext into XDI
TODO: Should be used by fcache_return and shared IBL routines,
but for now some assumptions are not quite the same.
Only assumption is that xcx cannot be touched (IBL expects looked up address)
if save_xdi we assume DCONTEXT_BASE_SPILL_SLOT can be clobbered
OUTPUT: xdi contains dcontext
if save_xdi DCONTEXT_BASE_SPILL_SLOT will contain saved value
FIXME: xdx is the spill slot -- switch over to xdx as base reg?
Have to measure perf effect first (case 5239)
00: mov xdi, tls_slot_scratch2 64 89 3d 0c 0f 00 00 mov %edi -> %fs:0xf0c
07: mov tls_slot_dcontext, xdi 64 8b 3d 14 0f 00 00 mov %fs:0xf14 -> %edi
if TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)
ASSERT_NOT_TESTED
endif
*/
void
insert_shared_get_dcontext(dcontext_t *dcontext, instrlist_t *ilist, instr_t *where,
                           bool save_xdi)
{
    /* optionally preserve the current xdi/r5 in its spill slot first */
    if (save_xdi) {
        PRE(ilist, where, SAVE_TO_TLS(dcontext, SCRATCH_REG5/*xdi/r5*/,
                                      DCONTEXT_BASE_SPILL_SLOT));
    }
    /* needed to support grabbing the dcontext w/ shared cache */
    PRE(ilist, where, RESTORE_FROM_TLS(dcontext, SCRATCH_REG5/*xdi/r5*/,
                                       TLS_DCONTEXT_SLOT));

    /* everything below only applies when the dcontext is self-protected */
    if (!TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
        return;

#ifdef X86
    {
        /* NOTE(review): not referenced by name in this routine; presumably
         * picked up by the RESTORE_FROM_DC/SAVE_TO_DC macros -- confirm before
         * renaming or removing.
         */
        bool absolute = false;
        /* PR 224798: we could avoid extra indirection by storing
         * unprotected_context_t in TLS_DCONTEXT_SLOT instead of dcontext_t
         */
        ASSERT_NOT_TESTED();
        /* we'd need a 3rd slot in order to nicely get unprot ptr into esi
         * we can do it w/ only 2 slots by clobbering dcontext ptr
         * (we could add base reg info to RESTORE_FROM_DC/SAVE_TO_DC and go
         * straight through esi to begin w/ and subtract one instr (xchg)
         */
        PRE(ilist, where, RESTORE_FROM_DC(dcontext, SCRATCH_REG5, PROT_OFFS));
        PRE(ilist, where, INSTR_CREATE_xchg(dcontext, opnd_create_reg(SCRATCH_REG4),
                                            opnd_create_reg(SCRATCH_REG5)));
        PRE(ilist, where, SAVE_TO_DC(dcontext, SCRATCH_REG5, SCRATCH_REG4_OFFS));
        /* the dcontext ptr was clobbered above: reload it */
        PRE(ilist, where, RESTORE_FROM_TLS(dcontext, SCRATCH_REG5, TLS_DCONTEXT_SLOT));
    }
#elif defined(ARM)
    /* FIXME i#1551: NYI on ARM */
    ASSERT_NOT_REACHED();
#endif
}
/* Restore XDI through TLS: inserts, before "where" in "ilist", a load of
 * SCRATCH_REG5 (xdi/r5) from DCONTEXT_BASE_SPILL_SLOT, which is the slot that
 * insert_shared_get_dcontext fills when it is called with save_xdi set.
 */
void
insert_shared_restore_dcontext_reg(dcontext_t *dcontext, instrlist_t *ilist,
instr_t *where)
{
PRE(ilist, where, RESTORE_FROM_TLS(dcontext, SCRATCH_REG5/*xdi/r5*/,
DCONTEXT_BASE_SPILL_SLOT));
}
/* append instructions to prepare for fcache return:
* i.e., far jump to switch mode, load dcontext, etc.
*
* # on X86
* ifdef X64 and (source is x86 mode)
* far direct jmp to next instr w/ 64-bit switch
* endif
*
* if (!absolute)
* mov %xdi,fs:xdx_OFFSET
* mov fs:dcontext,%xdi
* if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
* RESTORE_FROM_DCONTEXT PROT_OFFSET,%xdi
* xchg %xsi,%xdi
* SAVE_TO_UPCONTEXT %xdi,xsi_OFFSET
* mov fs:dcontext,%xdi
* endif
* # the steps below are emitted by append_save_gpr, not by this routine:
* # get xax and xdi into their real slots, via xbx
* SAVE_TO_UPCONTEXT %xbx,xbx_OFFSET
* mov fs:xax_OFFSET,%xbx
* SAVE_TO_UPCONTEXT %xbx,xax_OFFSET
* mov fs:xdx_OFFSET,%xbx
* SAVE_TO_UPCONTEXT %xbx,xdi_OFFSET
* endif
*/
/* Parameters:
 *   code     - gencode being generated; consulted (X86_64 builds only) to
 *              decide whether a far jmp switching x86-mode gencode to 64-bit
 *              mode has to be emitted first
 *   ilist    - list the instructions are appended to
 *   absolute - when set nothing beyond the mode switch is emitted
 *   shared   - must be set whenever !absolute
 */
static void
append_prepare_fcache_return(dcontext_t *dcontext, generated_code_t *code,
                             instrlist_t *ilist, bool absolute, bool shared)
{
#ifdef X86_64
    if (GENCODE_IS_X86(code->gencode_mode)) {
        /* far direct jmp to the next instr w/ 64-bit switch */
        instr_t *label = INSTR_CREATE_label(dcontext);
        instr_t *ljmp = INSTR_CREATE_jmp_far
            (dcontext, opnd_create_far_instr(CS64_SELECTOR, label));
        instr_set_x86_mode(ljmp, true/*x86*/);
        APP(ilist, ljmp);
        APP(ilist, label);
    }
#endif /* X86_64 */

    if (absolute)
        return;

    /* only support non-absolute w/ shared cache */
    ASSERT_NOT_IMPLEMENTED(shared);
    /* xax is in 1 scratch slot, so we have to use a 2nd scratch
     * slot in order to get dcontext into xdi
     */
    APP(ilist, SAVE_TO_TLS(dcontext, REG_DCTXT, DCONTEXT_BASE_SPILL_SLOT));
    APP(ilist, RESTORE_FROM_TLS(dcontext, REG_DCTXT, TLS_DCONTEXT_SLOT));
    if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) {
#ifdef X86
        /* we'd need a 3rd slot in order to nicely get unprot ptr into xsi
         * we can do it w/ only 2 slots by clobbering dcontext ptr
         * (we could add base reg info to RESTORE_FROM_DC/SAVE_TO_DC and go
         * straight through xsi to begin w/ and subtract one instr (xchg)
         */
        ASSERT_NOT_TESTED();
        APP(ilist, RESTORE_FROM_DC(dcontext, SCRATCH_REG5, PROT_OFFS));
        APP(ilist, INSTR_CREATE_xchg(dcontext, opnd_create_reg(SCRATCH_REG4),
                                     opnd_create_reg(SCRATCH_REG5)));
        APP(ilist, SAVE_TO_DC(dcontext, SCRATCH_REG5, SCRATCH_REG4_OFFS));
        APP(ilist, RESTORE_FROM_TLS(dcontext, SCRATCH_REG5, TLS_DCONTEXT_SLOT));
#elif defined(ARM)
        /* FIXME i#1551: NYI on ARM */
        ASSERT_NOT_REACHED();
#endif /* X86/ARM */
    }
}

/* Appends the call to dispatch(), passing the dcontext either as an immediate
 * (absolute) or in REG_DCTXT, followed by a jump to unexpected_return in case
 * dispatch() ever comes back.
 */
static void
append_call_dispatch(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
{
    /* call central dispatch routine */
    /* for x64 linux we could optimize and avoid the "mov rdi, rdi" */
    dr_insert_call((void *)dcontext, ilist, NULL/*append*/,
                   (void *)dispatch, 1,
                   absolute ?
                   OPND_CREATE_INTPTR((ptr_int_t)dcontext) : opnd_create_reg(REG_DCTXT));

    /* dispatch() shouldn't return! */
    insert_reachable_cti(dcontext, ilist, NULL, vmcode_get_start(),
                         (byte *)unexpected_return, true/*jmp*/, false/*!precise*/,
                         DR_REG_R11/*scratch*/, NULL);
}

/*
 * # fcache_return: context switch back to DynamoRIO.
 * # Invoked via
 * #     a) from the fcache via a fragment exit stub,
 * #     b) from indirect_branch_lookup().
 * # Invokes dispatch() with a clean dstack.
 * # Assumptions:
 * #     1) app's value in xax/r0 already saved in dcontext.
 * #     2) xax/r0 holds the linkstub ptr
 * #
 *
 * fcache_return:
 *  # append_prepare_fcache_return
 *  ifdef X64 and (source is x86 mode)
 *    far direct jmp to next instr w/ 64-bit switch
 *  endif
 *
 *  if (!absolute)
 *    mov  %xdi,fs:xdx_OFFSET
 *    mov  fs:dcontext,%xdi
 *    if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
 *      RESTORE_FROM_DCONTEXT PROT_OFFSET,%xdi
 *      xchg   %xsi,%xdi
 *      SAVE_TO_UPCONTEXT %xdi,xsi_OFFSET
 *      mov    fs:dcontext,%xdi
 *    endif
 *  endif
 *
 *  # append_save_gpr
 *  if (!absolute)
 *    # get xax and xdi into their real slots, via xbx
 *    SAVE_TO_UPCONTEXT %xbx,xbx_OFFSET
 *    mov    fs:xax_OFFSET,%xbx
 *    SAVE_TO_UPCONTEXT %xbx,xax_OFFSET
 *    mov    fs:xdx_OFFSET,%xbx
 *    SAVE_TO_UPCONTEXT %xbx,xdi_OFFSET
 *  endif
 *
 *  # save the current register state to context->regs
 *  # xax already in context
 *
 *  if (absolute)
 *    SAVE_TO_UPCONTEXT %xbx,xbx_OFFSET
 *  endif
 *  SAVE_TO_UPCONTEXT %xcx,xcx_OFFSET
 *  SAVE_TO_UPCONTEXT %xdx,xdx_OFFSET
 *  if (absolute || !TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
 *    SAVE_TO_UPCONTEXT %xsi,xsi_OFFSET
 *  endif
 *  if (absolute)
 *    SAVE_TO_UPCONTEXT %xdi,xdi_OFFSET
 *  endif
 *  SAVE_TO_UPCONTEXT %xbp,xbp_OFFSET
 *  SAVE_TO_UPCONTEXT %xsp,xsp_OFFSET
 *  ifdef X64
 *    SAVE_TO_UPCONTEXT %r8,r8_OFFSET
 *    SAVE_TO_UPCONTEXT %r9,r9_OFFSET
 *    SAVE_TO_UPCONTEXT %r10,r10_OFFSET
 *    SAVE_TO_UPCONTEXT %r11,r11_OFFSET
 *    SAVE_TO_UPCONTEXT %r12,r12_OFFSET
 *    SAVE_TO_UPCONTEXT %r13,r13_OFFSET
 *    SAVE_TO_UPCONTEXT %r14,r14_OFFSET
 *    SAVE_TO_UPCONTEXT %r15,r15_OFFSET
 *  endif
 *
 *  # append_save_simd_reg
 *  if preserve_xmm_caller_saved
 *    SAVE_TO_UPCONTEXT %xmm0,xmm_OFFSET+0*16
 *    SAVE_TO_UPCONTEXT %xmm1,xmm_OFFSET+1*16
 *    SAVE_TO_UPCONTEXT %xmm2,xmm_OFFSET+2*16
 *    SAVE_TO_UPCONTEXT %xmm3,xmm_OFFSET+3*16
 *    SAVE_TO_UPCONTEXT %xmm4,xmm_OFFSET+4*16
 *    SAVE_TO_UPCONTEXT %xmm5,xmm_OFFSET+5*16
 *    SAVE_TO_UPCONTEXT %xmm6,xmm_OFFSET+6*16  # 32-bit Linux
 *    SAVE_TO_UPCONTEXT %xmm7,xmm_OFFSET+7*16  # 32-bit Linux
 *  endif
 *
 *  # switch to clean dstack
 *  RESTORE_FROM_DCONTEXT dstack_OFFSET,%xsp
 *
 *  # append_save_clear_xflags
 *  # now save eflags -- too hard to do without a stack!
 *  pushf           # push eflags on stack
 *  pop     %xbx    # grab eflags value
 *  SAVE_TO_UPCONTEXT %xbx,xflags_OFFSET # save eflags value
 *
 *  # clear eflags now to avoid app's eflags messing up our ENTER_DR_HOOK
 *  # FIXME: this won't work at CPL0 if we ever run there!
 *  push  0
 *  popf
 *
 *  # append_call_enter_dr_hook
 *  if (ENTER_DR_HOOK != NULL && !dcontext->ignore_enterexit)
 *    # don't bother to save any registers around call except for xax
 *    # and xcx, which holds next_tag
 *    push  %xcx
 *    if (!absolute)
 *      push  %xdi
 *      push  %xsi
 *    endif
 *    push  %xax
 *    if (absolute)
 *      # support for skipping the hook (note: 32-bits even on x64)
 *      RESTORE_FROM_UPCONTEXT ignore_enterexit_OFFSET,%edi
 *      cmp    %edi,0
 *      jnz    post_hook
 *    endif
 *    # for x64 windows, reserve 32 bytes stack space for call prior to call
 *    call  ENTER_DR_HOOK
 *
 *   post_hook:
 *    pop   %xax
 *    if (!absolute)
 *      pop   %xsi
 *      pop   %xdi
 *    endif
 *    pop   %xcx
 *  endif
 *
 *  # save last_exit, currently in eax, into dcontext->last_exit
 *  SAVE_TO_DCONTEXT %xax,last_exit_OFFSET
 *
 *  .ifdef WINDOWS && CLIENT_INTERFACE
 *    swap_peb
 *  .endif
 *
 *  .ifdef SIDELINE
 *    # clear cur-trace field so we don't think cur trace is still running
 *    movl   $0, _sideline_trace
 *  .endif
 *
 *  # call central dispatch routine w/ dcontext as an argument
 *  if (absolute)
 *    push    <dcontext>
 *  else
 *    push     %xdi  # for x64, mov %xdi, ARG1
 *  endif
 *  call     dispatch # for x64 windows, reserve 32 bytes stack space for call
 *  # dispatch() shouldn't return!
 *  jmp      unexpected_return
 */

/* N.B.: this routine is used to generate both the regular fcache_return
 * and a slightly different copy that is used for the miss/unlinked paths
 * for indirect_branch_lookup for self-protection.
 * ibl_end should be true only for that end of the lookup routine.
 *
 * If linkstub != NULL, used for coarse fragments, this routine assumes that:
 * - app xax is still in %xax
 * - next target pc is in DIRECT_STUB_SPILL_SLOT tls
 * - linkstub is the linkstub_t to pass back to dispatch
 * - if coarse_info:
 *   - app xcx is in MANGLE_XCX_SPILL_SLOT
 *   - source coarse info is in %xcx
 *
 * We assume this routine does not use TLS slot FLOAT_PC_STATE_SLOT (TLS_SLOT_REG1).
 *
 * Returns the value produced by append_call_enter_dr_hook, which the callers
 * in this file hand to instrlist_encode as its final argument.
 */
bool
append_fcache_return_common(dcontext_t *dcontext, generated_code_t *code,
                            instrlist_t *ilist, bool ibl_end,
                            bool absolute, bool shared, linkstub_t *linkstub,
                            bool coarse_info)
{
    bool instr_targets;

    /* no support for absolute addresses on x64: we always use tls */
    IF_X64(ASSERT_NOT_IMPLEMENTED(!absolute && shared));

    /* currently linkstub is only used for coarse-grain exits */
    ASSERT(linkstub == NULL || !absolute);

    /* the mode-switch decision needs the gencode, so pass it along */
    append_prepare_fcache_return(dcontext, code, ilist, absolute, shared);
    append_save_gpr(dcontext, ilist, ibl_end, absolute, code, linkstub, coarse_info);
    append_save_simd_reg(dcontext, ilist, absolute);

    /* Switch to a clean dstack as part of our scheme to avoid state kept
     * unprotected across cache executions.
     * FIXME: this isn't perfect: we switch to the dstack BEFORE we call
     * the entrance hook that will be used to coordinate other threads,
     * so if our hook suspends all other threads to protect vs cross-thread
     * attacks, the dstack is not perfectly protected.
     */
    APP(ilist, RESTORE_FROM_DC(dcontext, REG_XSP, DSTACK_OFFSET));

    append_save_clear_xflags(dcontext, ilist, absolute);
    instr_targets = append_call_enter_dr_hook(dcontext, ilist, ibl_end, absolute);

    /* save last_exit, currently in scratch_reg0 into dcontext->last_exit */
    APP(ilist, SAVE_TO_DC(dcontext, SCRATCH_REG0, LAST_EXIT_OFFSET));

#if defined(WINDOWS) && defined(CLIENT_INTERFACE)
    /* i#249: isolate the PEB */
    if (INTERNAL_OPTION(private_peb) && should_swap_peb_pointer()) {
        preinsert_swap_peb(dcontext, ilist, NULL, absolute, SCRATCH_REG5,
                           SCRATCH_REG0/*scratch*/, true/*to priv*/);
    }
#endif /* WINDOWS && CLIENT_INTERFACE */

#ifdef SIDELINE
    if (dynamo_options.sideline) {
        /* clear cur-trace field so we don't think cur trace is still running */
        /* PR 248210: unsupported feature on x64 */
        IF_X64(ASSERT_NOT_IMPLEMENTED(false)); /* PR 244737: fix abs address */
        APP(ilist,
            XINST_CREATE_store(dcontext,
                               OPND_CREATE_MEM32(REG_NULL, (int)&sideline_trace),
                               OPND_CREATE_INT32(0)));
    }
#endif

    append_call_dispatch(dcontext, ilist, absolute);
    return instr_targets;
}
/* Emits the fcache_return variant that addresses the dcontext absolutely and
 * is not shared.  Returns the pc just past the generated code.
 */
byte *
emit_fcache_return(dcontext_t *dcontext, generated_code_t *code, byte *pc)
{
    instrlist_t ilist;
    bool has_instr_targets;
    byte *end_pc;

    instrlist_init(&ilist);
    has_instr_targets =
        append_fcache_return_common(dcontext, code, &ilist,
                                    false/*!ibl_end*/,
                                    true/*absolute*/, false/*!shared*/,
                                    NULL/*no linkstub*/, false/*not coarse*/);

    /* turn the list into machine code at pc */
    end_pc = instrlist_encode(dcontext, &ilist, pc, has_instr_targets);
    ASSERT(end_pc != NULL);

    /* the instrs are no longer needed once encoded */
    instrlist_clear(dcontext, &ilist);

    return end_pc;
}
/* Emits the shared fcache_enter variant, which reaches the dcontext through
 * xdi rather than via absolute addresses.  Returns the pc just past the
 * generated code.
 */
byte *
emit_fcache_enter_shared(dcontext_t *dcontext, generated_code_t *code, byte *pc)
{
    const bool absolute = false; /* through xdi */
    const bool shared = true;
    return emit_fcache_enter_common(dcontext, code, pc, absolute, shared);
}
/* Emits the shared fcache_return variant, which reaches the dcontext through
 * xdi rather than via absolute addresses.  Returns the pc just past the
 * generated code.
 */
byte *
emit_fcache_return_shared(dcontext_t *dcontext, generated_code_t *code, byte *pc)
{
    instrlist_t ilist;
    bool has_instr_targets;
    byte *end_pc;

    instrlist_init(&ilist);
    has_instr_targets =
        append_fcache_return_common(dcontext, code, &ilist,
                                    false/*!ibl_end*/,
                                    false/*through xdi*/, true/*shared*/,
                                    NULL/*no linkstub*/, false/*not coarse*/);

    /* turn the list into machine code at pc */
    end_pc = instrlist_encode(dcontext, &ilist, pc, has_instr_targets);
    ASSERT(end_pc != NULL);

    /* the instrs are no longer needed once encoded */
    instrlist_clear(dcontext, &ilist);

    return end_pc;
}