| /* ********************************************************** |
| * Copyright (c) 2010-2014 Google, Inc. All rights reserved. |
| * Copyright (c) 2000-2010 VMware, Inc. All rights reserved. |
| * **********************************************************/ |
| |
| /* |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * * Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * * Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * * Neither the name of VMware, Inc. nor the names of its contributors may be |
| * used to endorse or promote products derived from this software without |
| * specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE |
| * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
| * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH |
| * DAMAGE. |
| */ |
| |
| /* Copyright (c) 2003-2007 Determina Corp. */ |
| /* Copyright (c) 2001-2003 Massachusetts Institute of Technology */ |
| /* Copyright (c) 2000-2001 Hewlett-Packard Company */ |
| |
| /* file "emit_utils_shared.c" */ |
| /* The Pentium processors maintain cache consistency in hardware, so we don't |
| * worry about getting stale cache entries. |
| */ |
| /* FIXME i#1551: flush the code cache after updating it on ARM, since the |
| * hardware does not keep the instruction cache consistent with the data cache. |
| */ |
| |
| #include "../globals.h" |
| #include "../link.h" |
| #include "../fragment.h" |
| #include "../fcache.h" |
| #include "../emit.h" |
| |
| #include "arch.h" |
| #include "instr.h" |
| #include "instr_create.h" |
| #include "instrlist.h" |
| #include "instrument.h" /* for dr_insert_call() */ |
| #include "proc.h" |
| #include <string.h> /* for memcpy */ |
| #include "decode.h" |
| #include "decode_fast.h" |
| #include "x86/decode_private.h" |
| #ifdef DEBUG |
| # include "disassemble.h" |
| #endif |
| #include <limits.h> /* for UCHAR_MAX */ |
| #include "../perscache.h" |
| |
| #ifdef VMX86_SERVER |
| # include "vmkuw.h" |
| #endif |
| |
| /* fragment_t fields */ |
| /* CAUTION: if TAG_OFFS changes from 0, must change indirect exit stub! */ |
| #define FRAGMENT_START_PC_OFFS (offsetof(fragment_t, start_pc)) |
| #define FRAGMENT_COUNTER_OFFS (offsetof(fragment_t, hot_counter)) |
| #define FRAGMENT_PREFIX_SIZE_OFFS (offsetof(fragment_t, prefix_size)) |
| |
| #ifdef TRACE_HEAD_CACHE_INCR |
| /* linkstub_t field */ |
| # define LINKSTUB_TARGET_FRAG_OFFS (offsetof(direct_linkstub_t, target_fragment)) |
| #endif |
| |
| #ifdef PROFILE_LINKCOUNT |
| # define LINKSTUB_COUNT_OFFS (offsetof(linkstub_t, count)) |
| #endif |
| |
| |
| /* N.B.: I decided not to keep supporting DCONTEXT_IN_EDI. |
| * If we really want it later we can add it back; it's a pain to keep |
| * maintaining it with every change here. |
| */ |
| #ifdef DCONTEXT_IN_EDI |
| # error DCONTEXT_IN_EDI Not Implemented |
| #endif |
| |
| /* Make code more readable by shortening long lines. |
| * We mark all instructions as meta to avoid client interface asserts. |
| */ |
| #define POST instrlist_meta_postinsert |
| #define PRE instrlist_meta_preinsert |
| #define APP instrlist_meta_append |
| |
| /** |
| ** CAUTION! |
| ** |
| ** The following definitions and routines are highly dependent upon |
| ** definitions made in x86.asm. Do NOT change any constants or code |
| ** without first consulting that file. |
| ** |
| **/ |
| |
| /*************************************************************************** |
| *************************************************************************** |
| ** EXIT STUB |
| ** |
| ** WARNING: all exit stubs must support atomic linking and unlinking, |
| ** meaning a link/unlink operation must involve a single store! |
| ** There is an exception: a first-time link (detected using a sentinel |
| ** LINKCOUNT_NEVER_LINKED_SENTINEL placed where the unlinked entry |
| ** code will go once linked) does not need to be atomic. |
| **/ |
| |
| /* FIXME i#1551: update the comments in this file to cover both x86 and ARM */ |
| /* |
| direct branch exit_stub: |
| 5x8 mov %xax, xax_offs(&dcontext) or tls |
| #if defined(PROFILE_LINKCOUNT) (PR 248210: x64 not supported) |
| | 1 lahf |
| | 3 seto %al |
| |#if !defined(LINKCOUNT_64_BITS) |
| | 6 inc l->count |
| |#else |
| | 7 add $1,l->count |
| | 7 adc $0,l->count+4 |
| |#endif |
| | 2 add $0x7f,%al |
| | 1 sahf |
| #endif |
| 5x10 mov &linkstub, %xax |
| 5 jmp target addr |
| #if defined(PROFILE_LINKCOUNT) (PR 248210: x64 not supported) |
| |unlinked entry point: |
| | 5 movl %eax, eax_offs(&dcontext) |
| | 5 movl &linkstub, %eax |
| | 5 jmp fcache_return |
| | |
| | Notes: we link/unlink by modifying the 1st jmp to either target unlinked |
| | entry point or the target fragment. When we link for the first time |
| | we try to remove the eflags save/restore, shifting the 1st jmp up (the |
| | space between it and unlinked entry just becomes junk). |
| #endif |
| |
| indirect branch exit_stub (only used if -indirect_stubs): |
| 6x9 mov %xbx, xbx_offs(&dcontext) or tls |
| 5x11 mov &linkstub, %xbx |
| 5 jmp indirect_branch_lookup |
| |
| indirect branches use xbx so that the flags can be saved into xax using |
| the lahf instruction! |
| xref PR 249775 on lahf support on x64. |
| |
| for PROFILE_LINKCOUNT, the count increment is performed inside the |
| hashtable lookup (in both linked and unlinked paths) both since the flags |
| are saved there for the linked path and to save space in stubs |
| |
| also see emit_inline_ibl_stub() below |
| |
| */ |
| |
| /* DIRECT_EXIT_STUB_SIZE is in arch_exports.h */ |
| #define STUB_DIRECT_SIZE(flags) DIRECT_EXIT_STUB_SIZE(flags) |
| |
| /* for -thread_private, we're relying on the fact that |
| * SIZE32_MOV_XBX_TO_TLS == SIZE32_MOV_XBX_TO_ABS, and that |
| * x64 always uses tls |
| */ |
| #define STUB_INDIRECT_SIZE32 \ |
| (SIZE32_MOV_XBX_TO_TLS + SIZE32_MOV_PTR_IMM_TO_XAX + JMP_LONG_LENGTH) |
| #define STUB_INDIRECT_SIZE64 \ |
| (SIZE64_MOV_XBX_TO_TLS + SIZE64_MOV_PTR_IMM_TO_XAX + JMP_LONG_LENGTH) |
| #define STUB_INDIRECT_SIZE(flags) \ |
| (FRAG_IS_32(flags) ? STUB_INDIRECT_SIZE32 : STUB_INDIRECT_SIZE64) |
| |
| /* STUB_COARSE_DIRECT_SIZE is in arch_exports.h */ |
| #define STUB_COARSE_INDIRECT_SIZE(flags) (STUB_INDIRECT_SIZE(flags)) |
| |
| #ifndef LINKCOUNT_64_BITS |
| # define LINKCOUNT_INCSIZE (6) |
| #else |
| # define LINKCOUNT_INCSIZE (7+7) |
| #endif |
| #define LINKCOUNT_EFLAGS_SAVE (3+1) |
| #define LINKCOUNT_EFLAGS_RESTORE (2+1) |
| #define LINKCOUNT_FLAGSIZE (LINKCOUNT_EFLAGS_SAVE + LINKCOUNT_EFLAGS_RESTORE) |
| |
| #define LINKCOUNT_DIRECT_EXTRA(flags) \ |
| (LINKCOUNT_INCSIZE + LINKCOUNT_FLAGSIZE + STUB_DIRECT_SIZE(flags)) |
| #define LINKCOUNT_UNLINKED_ENTRY(flags) \ |
| (LINKCOUNT_INCSIZE + LINKCOUNT_FLAGSIZE + STUB_DIRECT_SIZE(flags)) |
| |
| /* used to distinguish a never-linked direct exit -- once linked this |
| * will be replaced by the beginning of the unlink entry point, which is |
| * a save of xax, which will never look like this. we choose nops to |
| * avoid complicating our disassembly routines. |
| */ |
| #define LINKCOUNT_NEVER_LINKED_SENTINEL 0x90909090 |
| |
| /* Return size in bytes required for an exit stub with specified |
| * target and FRAG_ flags |
| */ |
| int |
| exit_stub_size(dcontext_t *dcontext, cache_pc target, uint flags) |
| { |
| if (TEST(FRAG_COARSE_GRAIN, flags)) { |
| /* For coarse: bb building points at the bb ibl, and then insert_exit_stub |
| * changes that to the appropriate coarse prefix. So the emit() calls to |
| * this routine pass in a real ibl. But any later calls, e.g. for |
| * disassembly, that ask linkstub_size() will call EXIT_TARGET_TAG(), which |
| * calls indirect_linkstub_target(), which returns get_coarse_ibl_prefix(), |
| * which is then not recognized as indirect by this routine! |
| * Note that coarse_indirect_stub_jmp_target() derefs the prefix: |
| * should we require callers who have the stub pc to call that instead of us |
| * de-referencing? |
| */ |
| target = coarse_deref_ibl_prefix(dcontext, target); |
| } |
| if (is_indirect_branch_lookup_routine(dcontext, target)) { |
| /* indirect branch */ |
| |
| /* FIXME: Since we don't have the stub flags we'll look up the |
| * target routine's template in a very roundabout fashion here |
| * by dispatching on the ibl_routine entry point. |
| */ |
| ibl_code_t *ibl_code; |
| |
| ibl_type_t ibl_type; |
| IF_X64(gencode_mode_t mode;) |
| DEBUG_DECLARE(bool is_ibl = ) |
| get_ibl_routine_type_ex(dcontext, target, &ibl_type _IF_X64(&mode)); |
| ASSERT(is_ibl); |
| IF_X64(ASSERT(mode == FRAGMENT_GENCODE_MODE(flags) || |
| (DYNAMO_OPTION(x86_to_x64) && mode == GENCODE_X86_TO_X64))); |
| ibl_code = get_ibl_routine_code_ex(dcontext, ibl_type.branch_type, flags |
| _IF_X64(mode)); |
| |
| if (!EXIT_HAS_STUB(ibltype_to_linktype(ibl_code->branch_type), |
| IBL_FRAG_FLAGS(ibl_code))) |
| return 0; |
| |
| if (TEST(FRAG_COARSE_GRAIN, flags)) { |
| IF_WINDOWS(ASSERT(!is_shared_syscall_routine(dcontext, target))); |
| /* keep in synch w/ coarse_indirect_stub_size() */ |
| return (STUB_COARSE_INDIRECT_SIZE(flags)); |
| } |
| |
| #ifdef WINDOWS |
| if (is_shared_syscall_routine(dcontext, target)) { |
| return INTERNAL_OPTION(shared_syscalls_fastpath) ? 5 : |
| STUB_INDIRECT_SIZE(flags); |
| } |
| #endif |
| |
| if (ibl_code->ibl_head_is_inlined) |
| return ibl_code->inline_stub_length; |
| else |
| return (STUB_INDIRECT_SIZE(flags)); |
| } else { |
| /* direct branch */ |
| if (TEST(FRAG_COARSE_GRAIN, flags)) |
| return (STUB_COARSE_DIRECT_SIZE(flags)); |
| #ifdef PROFILE_LINKCOUNT |
| if (dynamo_options.profile_counts && (flags & FRAG_IS_TRACE) != 0) |
| return (STUB_DIRECT_SIZE(flags) + LINKCOUNT_DIRECT_EXTRA(flags)); |
| else { |
| #endif |
| return (STUB_DIRECT_SIZE(flags)); |
| #ifdef PROFILE_LINKCOUNT |
| } |
| #endif |
| } |
| } |
| |
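| /* Shared logic for is_patchable_exit_stub() and is_exit_cti_stub_patchable(): |
| * given linkstub flags (lflags) and fragment flags (fflags), returns whether |
| * the exit stub itself is ever patched after it is emitted. |
| */ |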
| static bool |
| is_patchable_exit_stub_helper(dcontext_t *dcontext, cache_pc ltarget, |
| ushort lflags, uint fflags) |
| { |
| if (LINKSTUB_INDIRECT(lflags)) { |
| /* indirect */ |
| if (!DYNAMO_OPTION(indirect_stubs)) |
| return false; |
| if ( |
| #ifdef WINDOWS |
| !is_shared_syscall_routine(dcontext, ltarget) && |
| #endif |
| get_ibl_routine_code(dcontext, extract_branchtype(lflags), fflags) |
| ->ibl_head_is_inlined) { |
| return !DYNAMO_OPTION(atomic_inlined_linking); |
| } else { |
| return true; |
| } |
| } else { |
| /* direct */ |
| ASSERT(LINKSTUB_DIRECT(lflags)); |
| #if defined(PROFILE_LINKCOUNT) || defined(TRACE_HEAD_CACHE_INCR) |
| return true; |
| #else |
| return false; |
| #endif |
| } |
| } |
| |
| bool |
| is_patchable_exit_stub(dcontext_t *dcontext, linkstub_t *l, fragment_t *f) |
| { |
| return is_patchable_exit_stub_helper(dcontext, EXIT_TARGET_TAG(dcontext, f, l), |
| l->flags, f->flags); |
| } |
| |
| bool |
| is_exit_cti_stub_patchable(dcontext_t *dcontext, instr_t *inst, uint frag_flags) |
| { |
| app_pc target; |
| /* we figure out what the linkstub flags should be |
| * N.B.: we have to be careful to match the LINKSTUB_ macros |
| */ |
| ushort lflags = (ushort) instr_exit_branch_type(inst); |
| ASSERT_TRUNCATE(lflags, ushort, instr_exit_branch_type(inst)); |
| ASSERT(instr_is_exit_cti(inst)); |
| target = instr_get_branch_target_pc(inst); |
| if (is_indirect_branch_lookup_routine(dcontext, target)) { |
| lflags |= LINK_INDIRECT; |
| } else { |
| lflags |= LINK_DIRECT; |
| } |
| return is_patchable_exit_stub_helper(dcontext, target, lflags, frag_flags); |
| } |
| |
| uint |
| bytes_for_exitstub_alignment(dcontext_t *dcontext, linkstub_t *l, |
| fragment_t *f, byte *startpc) |
| { |
| if (is_patchable_exit_stub(dcontext, l, f)) { |
| /* assumption - we only hot patch the ending jmp of the exit stub |
| * (and that exit stub size returns the right values) */ |
| ptr_uint_t shift = ALIGN_SHIFT_SIZE |
| (startpc + |
| exit_stub_size(dcontext, EXIT_TARGET_TAG(dcontext, f, l), f->flags) - |
| EXIT_STUB_PATCH_OFFSET, |
| EXIT_STUB_PATCH_SIZE, PAD_JMPS_ALIGNMENT); |
| #ifdef PROFILE_LINKCOUNT |
| /* assumption doesn't hold because of the optimize ... */ |
| /* FIXME : once this is implemented re-enable the ifdefed out stats |
| * in emit_fragment_common */ |
| ASSERT_NOT_IMPLEMENTED(false); |
| #endif |
| IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_uint(shift))); |
| return (uint) shift; |
| } |
| return 0; |
| } |
| |
| /* Returns an upper bound on the number of bytes that will be needed to add |
| * this fragment to a trace */ |
| uint |
| extend_trace_pad_bytes(fragment_t *add_frag) |
| { |
| /* FIXME: this is a poor estimate; we could do better by looking at the |
| * linkstubs and checking if we are inlining ibl, but since this is just |
| * used by monitor.c for a max size check it should be fine to overestimate: |
| * we'll just end up with slightly shorter max-size traces. */ |
| /* We don't trace through traces in normal builds, so don't worry about the |
| * number of exits (FIXME: this also assumes bbs don't trace through |
| * conditional or indirect branches). */ |
| ASSERT_NOT_IMPLEMENTED(!TEST(FRAG_IS_TRACE, add_frag->flags)); |
| /* Also, if -pad_jmps_shift_bb we assume that we don't need to remove |
| * any nops from fragments added to traces since there shouldn't be any if |
| * we only add bbs (nop_pad_ilist has an assert that verifies we don't add |
| * any nops to bbs when -pad_jmps_shift_bb without marking as CANNOT_BE_TRACE, |
| * so here we also verify that we only add bbs) - Xref PR 215179, UNIX syscall |
| * fence exits and CLIENT_INTERFACE added/moved exits can lead to bbs with |
| * additional hot_patchable locations. We mark such bb fragments as CANNOT_BE_TRACE |
| * in nop_pad_ilist() if -pad_jmps_mark_no_trace is set or assert otherwise to avoid |
| * various difficulties so should not see them here. */ |
| /* A standard bb has at most 2 patchable locations (ends in conditional or ends |
| * in indirect that is promoted to inlined). */ |
| return 2*MAX_PAD_SIZE; |
| } |
| |
| /* return startpc shifted by the necessary bytes to pad patchable jmps of the |
| * exit stub to proper alignment */ |
| byte * |
| pad_for_exitstub_alignment(dcontext_t *dcontext, linkstub_t *l, |
| fragment_t *f, byte *startpc) |
| { |
| uint shift; |
| ASSERT(PAD_FRAGMENT_JMPS(f->flags)); /* shouldn't call this otherwise */ |
| |
| shift = bytes_for_exitstub_alignment(dcontext, l, f, startpc); |
| if (shift > 0) { |
| /* Pad with 1-byte instructions so the padding looks nice in debuggers. |
| * decode_fragment also checks for this as a sanity check. Note that |
| * while these instructions can never be reached, they will be decoded |
| * by the fcache-pointer-shifting code, so we must put something valid here. */ |
| SET_TO_DEBUG(startpc, shift); |
| startpc += shift; |
| STATS_PAD_JMPS_ADD(f->flags, num_shifted_stubs, 1); |
| STATS_PAD_JMPS_ADD(f->flags, shifted_stub_bytes, shift); |
| } else { |
| STATS_PAD_JMPS_ADD(f->flags, num_stubs_no_shift, 1); |
| } |
| return startpc; |
| } |
| |
| /* Only used if -no_pad_jmps_shift_{bb,trace}. FIXME: this routine is expensive (the |
| * instr_expand) and we may end up removing app nops (an optimization, but |
| * not really what we're after here). */ |
| void |
| remove_nops_from_ilist(dcontext_t *dcontext, instrlist_t *ilist |
| _IF_DEBUG(bool recreating)) |
| { |
| instr_t *inst, *next_inst; |
| |
| for (inst = instrlist_first(ilist); inst != NULL; inst = next_inst) { |
| /* FIXME : expensive, just expand instr before cti, function not used |
| * if -no_pad_jmps_shift_{bb,trace} */ |
| inst = instr_expand(dcontext, ilist, inst); |
| next_inst = instr_get_next(inst); |
| if (instr_is_nop(inst)) { |
| instrlist_remove(ilist, inst); |
| DOSTATS({ |
| if (!recreating) { |
| STATS_INC(num_nops_removed); |
| STATS_ADD(num_nop_bytes_removed, instr_length(dcontext, inst)); |
| } |
| }); |
| instr_destroy(dcontext, inst); |
| } |
| } |
| } |
| |
| cache_pc |
| get_direct_exit_target(dcontext_t *dcontext, uint flags) |
| { |
| if (FRAG_DB_SHARED(flags)) { |
| if (TEST(FRAG_COARSE_GRAIN, flags)) { |
| /* note that entrance stubs should target their unit's prefix, |
| * which will then target this routine |
| */ |
| return fcache_return_coarse_routine(IF_X64(FRAGMENT_GENCODE_MODE(flags))); |
| } else |
| return fcache_return_shared_routine(IF_X64(FRAGMENT_GENCODE_MODE(flags))); |
| } else { |
| return fcache_return_routine_ex(dcontext _IF_X64(FRAGMENT_GENCODE_MODE(flags))); |
| } |
| } |
| |
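| /* Emits the exit stub for l at stub_pc using l's own flags; simply forwards |
| * to insert_exit_stub_other_flags(). |
| */ |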
| int |
| insert_exit_stub(dcontext_t *dcontext, fragment_t *f, |
| linkstub_t *l, cache_pc stub_pc) |
| { |
| return insert_exit_stub_other_flags(dcontext, f, l, stub_pc, l->flags); |
| } |
| |
| /* Patch the (direct) branch at branch_pc so it branches to target_pc. |
| * The write that actually patches the branch is done atomically, so this |
| * function is safe with respect to a thread executing this branch, presuming |
| * that both the before and after targets are valid and that [pc, pc+4) does |
| * not cross a cache line. |
| */ |
| void |
| patch_branch(cache_pc branch_pc, cache_pc target_pc, bool hot_patch) |
| { |
| cache_pc byte_ptr = exit_cti_disp_pc(branch_pc); |
| insert_relative_target(byte_ptr, target_pc, hot_patch); |
| } |
| |
| #ifdef PROFILE_LINKCOUNT |
| static byte * |
| change_linkcount_target(byte *pc, app_pc target) |
| { |
| /* Once we've linked once, we modify the jmp at the end of the |
| * link code in the stub to either jmp to the unlinked entry |
| * (which has no counter inc code of its own, that's why the exit |
| * jmp doesn't go straight there) or to the target. |
| * To find the jmp, watch first opcode to determine which state |
| * stub is in (depending on whether had to save eflags or not). |
| */ |
| if (*pc == 0xff || *pc == 0x83) { /* inc/add is 1st instr */ |
| pc += LINKCOUNT_INCSIZE + 1; |
| } else { |
| IF_X64(ASSERT_NOT_IMPLEMENTED(false)); /* need to pass in flags */ |
| pc += LINKCOUNT_INCSIZE + LINKCOUNT_FLAGSIZE + STUB_DIRECT_SIZE(FRAG_32_BIT) - 4; |
| } |
| pc = insert_relative_target(pc, target, HOT_PATCHABLE); |
| return pc; |
| } |
| |
| static void |
| optimize_linkcount_stub(dcontext_t *dcontext, fragment_t *f, |
| linkstub_t *l, fragment_t *targetf) |
| { |
| /* first-time link: try to remove eflags save/restore */ |
| # ifdef CUSTOM_EXIT_STUBS |
| byte *stub_pc = (byte *) EXIT_FIXED_STUB_PC(dcontext, f, l); |
| # else |
| byte *stub_pc = (byte *) EXIT_STUB_PC(dcontext, f, l); |
| # endif |
| byte *pc = stub_pc; |
| bool remove_eflags_save = false; |
| ASSERT(linkstub_owned_by_fragment(dcontext, f, l)); |
| ASSERT(LINKSTUB_DIRECT(l->flags)); |
| |
| if (!INTERNAL_OPTION(unsafe_ignore_eflags_prefix)) { |
| remove_eflags_save = TEST(FRAG_WRITES_EFLAGS_6, targetf->flags); |
| } |
| else { |
| /* scan through code at target fragment, stop scanning at 1st branch */ |
| uint eflags = 0; |
| cache_pc end_pc = EXIT_CTI_PC(f, FRAGMENT_EXIT_STUBS(targetf)); |
| byte *fpc = (byte *) FCACHE_ENTRY_PC(targetf); |
| /* for simplicity, stop at first instr that touches the flags */ |
| while (eflags == 0 && fpc != NULL && ((cache_pc)fpc) < end_pc) { |
| fpc = decode_eflags_usage(dcontext, fpc, &eflags); |
| } |
| remove_eflags_save = |
| (eflags & (EFLAGS_WRITE_6|EFLAGS_READ_6)) == EFLAGS_WRITE_6; |
| } |
| if (remove_eflags_save) { |
| /* the 6 flags modified by add and adc are written before |
| * they're read -> don't need to save eflags! |
| * |
| * I tried replacing lahf & sahf w/ nops, it's noticeably |
| * faster to not have the nops, so redo the increment: |
| */ |
| pc = insert_linkcount_inc(pc, l); |
| pc = insert_relative_jump(pc, FCACHE_ENTRY_PC(targetf), |
| NOT_HOT_PATCHABLE); |
| /* Fill out with nops up to the unlinked entry point so the stub disassembles |
| * nicely in the logfile (we're profiling link counts so we're presumably |
| * going to dump this). */ |
| while (pc < (stub_pc + LINKCOUNT_DIRECT_EXTRA(f->flags))) { |
| *pc = 0x90; pc++; /* nop */ |
| } |
| } else { |
| /* keep eflags save & restore -- need to keep save of eax |
| * so skip all that now, go to right before store of &l into eax |
| */ |
| pc += LINKCOUNT_DIRECT_EXTRA(f->flags) - 5 - 5; |
| /* need to insert a restore of eax -- luckily it perfectly |
| * overwrites the store of &l into eax, FIXME - dangerous |
| * though, if we ever drop the addr16 flag on a shared restore the |
| * instruction will be 6 bytes and our hardcoded 5 above will |
| * lead to a crash (should trigger assert below at least). |
| */ |
| pc = insert_restore_xax(dcontext, pc, f->flags, FRAG_DB_SHARED(f->flags), |
| DIRECT_STUB_SPILL_SLOT, true); |
| ASSERT(pc == stub_pc + LINKCOUNT_DIRECT_EXTRA(f->flags) - 5); |
| /* now add jmp */ |
| pc = insert_relative_jump(pc, FCACHE_ENTRY_PC(targetf), |
| NOT_HOT_PATCHABLE); |
| } |
| |
| /* we need to replace our never-linked sentinel w/ the real |
| * unlinked entry point. |
| */ |
| ASSERT(pc == stub_pc + LINKCOUNT_DIRECT_EXTRA(f->flags)); |
| pc = stub_pc + LINKCOUNT_DIRECT_EXTRA(f->flags); |
| ASSERT(*((uint *)pc) == LINKCOUNT_NEVER_LINKED_SENTINEL); |
| pc = insert_save_xax(dcontext, pc, f->flags, FRAG_DB_SHARED(f->flags), |
| DIRECT_STUB_SPILL_SLOT, true); |
| /* mov $linkstub_ptr,%xax */ |
| *pc = MOV_IMM2XAX_OPCODE; pc++; |
| IF_X64(ASSERT_NOT_IMPLEMENTED(false)); |
| *((uint *)pc) = (uint)l; pc += 4; |
| /* jmp to target */ |
| pc = insert_relative_jump(pc, get_direct_exit_target(dcontext, f->flags), |
| NOT_HOT_PATCHABLE); |
| } |
| #endif /* PROFILE_LINKCOUNT */ |
| |
| /* Checks patchable exit cti for proper alignment for patching. If it's |
| * properly aligned returns 0, else returns the number of bytes it would |
| * need to be forward shifted to be properly aligned */ |
| uint |
| patchable_exit_cti_align_offs(dcontext_t *dcontext, instr_t *inst, cache_pc pc) |
| { |
| /* all our exit cti's currently use 4 byte offsets */ |
| /* FIXME: would be better to use an instr_is_cti_long or some such; |
| * also should check for addr16 flag (we shouldn't have any prefixes) */ |
| ASSERT((instr_is_cti(inst) && !instr_is_cti_short(inst) && |
| !TESTANY(~(PREFIX_JCC_TAKEN|PREFIX_JCC_NOT_TAKEN), instr_get_prefixes(inst))) |
| || instr_is_cti_short_rewrite(inst, NULL)); |
| IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_uint |
| (ALIGN_SHIFT_SIZE(pc + instr_length(dcontext, inst) - CTI_PATCH_SIZE, |
| CTI_PATCH_SIZE, PAD_JMPS_ALIGNMENT)))); |
| return (uint) ALIGN_SHIFT_SIZE(pc + instr_length(dcontext, inst) - CTI_PATCH_SIZE, |
| CTI_PATCH_SIZE, PAD_JMPS_ALIGNMENT); |
| } |
| |
| /* Returns true if the exit cti is ever dynamically modified */ |
| bool |
| is_exit_cti_patchable(dcontext_t *dcontext, instr_t *inst, uint frag_flags) |
| { |
| app_pc target; |
| if (TEST(FRAG_COARSE_GRAIN, frag_flags)) { |
| /* Case 8647: coarse grain fragment bodies always link through stubs |
| * until frozen, so their ctis are never patched except at freeze time |
| * when we suspend the world. |
| */ |
| ASSERT(!TEST(FRAG_IS_TRACE, frag_flags)); |
| return false; |
| } |
| ASSERT(instr_is_exit_cti(inst)); |
| target = instr_get_branch_target_pc(inst); |
| if (is_indirect_branch_lookup_routine(dcontext, target)) { |
| /* whether it has an inline stub or not, the cti is always |
| * patched if -no_indirect_stubs |
| */ |
| if (!DYNAMO_OPTION(indirect_stubs)) |
| return true; |
| #ifdef WINDOWS |
| if (target != shared_syscall_routine(dcontext)) { |
| #endif |
| return get_ibl_routine_code(dcontext, |
| extract_branchtype((ushort)instr_exit_branch_type(inst)), |
| frag_flags)->ibl_head_is_inlined; |
| #ifdef WINDOWS |
| } |
| return false; |
| #endif |
| } else { |
| /* direct exit */ |
| #ifdef PROFILE_LINKCOUNT |
| if (DYNAMO_OPTION(profile_counts) && TEST(FRAG_IS_TRACE, frag_flags)) { |
| # ifdef CUSTOM_EXIT_STUBS |
| return true; |
| # else |
| return false; |
| # endif |
| } |
| #endif |
| if (instr_branch_special_exit(inst)) |
| return false; |
| return true; |
| } |
| } |
| |
| /* returns true if exit cti no longer points at stub |
| * (certain situations, like profiling or TRACE_HEAD_CACHE_INCR, go |
| * through the stub even when linked) |
| */ |
| bool |
| link_direct_exit(dcontext_t *dcontext, fragment_t *f, linkstub_t *l, fragment_t *targetf, |
| bool hot_patch) |
| { |
| #if defined(PROFILE_LINKCOUNT) || defined(TRACE_HEAD_CACHE_INCR) |
| # ifdef CUSTOM_EXIT_STUBS |
| byte *stub_pc = (byte *) (EXIT_FIXED_STUB_PC(dcontext, f, l)); |
| # else |
| byte *stub_pc = (byte *) (EXIT_STUB_PC(dcontext, f, l)); |
| # endif |
| #endif |
| ASSERT(linkstub_owned_by_fragment(dcontext, f, l)); |
| ASSERT(LINKSTUB_DIRECT(l->flags)); |
| STATS_INC(num_direct_links); |
| |
| #ifdef PROFILE_LINKCOUNT |
| if (dynamo_options.profile_counts && TEST(FRAG_IS_TRACE, f->flags)) { |
| /* do not change the exit jmp, instead change the stub itself */ |
| if (*((uint *)(stub_pc + LINKCOUNT_DIRECT_EXTRA(f->flags))) == |
| LINKCOUNT_NEVER_LINKED_SENTINEL) { |
| /* this is not atomic, but that's ok, it's first-time only */ |
| /* FIXME - this assumption is not safe with the shared cache, |
| * since we add to the table and link incoming before linking outgoing |
| */ |
| optimize_linkcount_stub(dcontext, f, l, targetf); |
| # ifdef CUSTOM_EXIT_STUBS |
| /* FIXME: want flag that says whether should go through custom |
| * only when unlinked, or always! |
| * For now we assume only when unlinked: |
| */ |
| /* skip custom code */ |
| patch_branch(EXIT_CTI_PC(f, l), stub_pc, |
| TEST(FRAG_SHARED, f->flags) ? hot_patch : NOT_HOT_PATCHABLE); |
| # endif |
| } else { |
| # ifdef CUSTOM_EXIT_STUBS |
| /* FIXME: want flag that says whether should go through custom |
| * only when unlinked, or always! |
| * For now we assume only when unlinked: |
| */ |
| /* skip custom code */ |
| patch_branch(EXIT_CTI_PC(f, l), stub_pc, hot_patch); |
| # endif |
| change_linkcount_target(stub_pc, FCACHE_ENTRY_PC(targetf)); |
| } |
| # ifdef TRACE_HEAD_CACHE_INCR |
| /* yes, we wait for linkcount to do its thing and then we change it -- |
| * but making this more efficient would make this already ungainly |
| * code even harder to read |
| */ |
| /* FIXME - atomicity issues? */ |
| if ((targetf->flags & FRAG_IS_TRACE_HEAD) != 0) { |
| /* after optimized inc, jmp to unlinked code, but change its final |
| * jmp to go to incr routine |
| */ |
| change_linkcount_target(stub_pc, stub_pc + LINKCOUNT_UNLINKED_ENTRY(f->flags)); |
| LOG(THREAD, LOG_LINKS, 4, |
| "\tlinking F%d."PFX" to incr routine b/c F%d is trace head\n", |
| f->id, EXIT_CTI_PC(f, l), targetf->id); |
| patch_branch(stub_pc + LINKCOUNT_UNLINKED_ENTRY(f->flags) + 10, |
| trace_head_incr_routine(dcontext), hot_patch); |
| } |
| # endif |
| return false; /* going through stub */ |
| } |
| #endif /* PROFILE_LINKCOUNT */ |
| |
| #ifdef TRACE_HEAD_CACHE_INCR |
| if ((targetf->flags & FRAG_IS_TRACE_HEAD) != 0) { |
| LOG(THREAD, LOG_LINKS, 4, |
| "\tlinking F%d."PFX" to incr routine b/c F%d is trace head\n", |
| f->id, EXIT_CTI_PC(f, l), targetf->id); |
| /* FIXME: more efficient way than multiple calls to get size-5? */ |
| ASSERT(linkstub_size(dcontext, f, l) == DIRECT_EXIT_STUB_SIZE(f->flags)); |
| patch_branch(stub_pc + DIRECT_EXIT_STUB_SIZE(f->flags) - 5, |
| trace_head_incr_routine(dcontext), hot_patch); |
| return false; /* going through stub */ |
| } |
| #endif |
| |
| /* change jmp target to point to the passed-in target */ |
| #ifdef UNSUPPORTED_API |
| if ((l->flags & LINK_TARGET_PREFIX) != 0) { |
| /* want to target just the xcx restore, not the eflags restore |
| * (only ibl targets eflags restore) |
| */ |
| patch_branch(EXIT_CTI_PC(f, l), FCACHE_PREFIX_ENTRY_PC(targetf), |
| hot_patch); |
| } else |
| #endif |
| patch_branch(EXIT_CTI_PC(f, l), FCACHE_ENTRY_PC(targetf), hot_patch); |
| return true; /* do not need stub anymore */ |
| } |
| |
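| /* Unlinks the direct exit l by pointing its exit cti back at the top of its |
| * exit stub (with extra stub patching for the TRACE_HEAD_CACHE_INCR and |
| * PROFILE_LINKCOUNT builds). |
| */ |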
| void |
| unlink_direct_exit(dcontext_t *dcontext, fragment_t *f, linkstub_t *l) |
| { |
| cache_pc stub_pc = (cache_pc) EXIT_STUB_PC(dcontext, f, l); |
| #ifdef TRACE_HEAD_CACHE_INCR |
| direct_linkstub_t *dl = (direct_linkstub_t *) l; |
| #endif |
| ASSERT(linkstub_owned_by_fragment(dcontext, f, l)); |
| ASSERT(LINKSTUB_DIRECT(l->flags)); |
| |
| #ifdef PROFILE_LINKCOUNT |
| if (dynamo_options.profile_counts && TEST(FRAG_IS_TRACE, f->flags)) { |
| byte *pc; |
| if (*((uint *)(stub_pc + LINKCOUNT_DIRECT_EXTRA(f->flags))) == |
| LINKCOUNT_NEVER_LINKED_SENTINEL) { |
| /* never been linked, don't go pointing at the uninitialized |
| * unlink entry point -- just return, initial state is fine |
| */ |
| return; |
| } |
| # ifdef CUSTOM_EXIT_STUBS |
| pc = (byte *) (EXIT_FIXED_STUB_PC(dcontext, f, l)); |
| stub_pc = (cache_pc) pc; |
| /* FIXME: want flag that says whether should go through custom |
| * only when unlinked, or always! Also is racy with 2nd branch patch. |
| * For now we assume only when unlinked. |
| */ |
| /* go through custom code again */ |
| patch_branch(EXIT_CTI_PC(f, l), stub_pc, HOT_PATCHABLE); |
| # else |
| pc = (byte *) stub_pc; |
| # endif |
| # ifdef TRACE_HEAD_CACHE_INCR |
| if (dl->target_fragment != NULL) { /* HACK to tell if targeted trace head */ |
| /* make unlinked jmp go back to fcache_return */ |
| patch_branch(pc + LINKCOUNT_UNLINKED_ENTRY(f->flags) + 10, |
| get_direct_exit_target(dcontext, f->flags), |
| HOT_PATCHABLE); |
| } else |
| # endif |
| /* make jmp after incr go to unlinked entry */ |
| change_linkcount_target(pc, stub_pc + LINKCOUNT_UNLINKED_ENTRY(f->flags)); |
| return; |
| } |
| #endif |
| |
| #ifdef TRACE_HEAD_CACHE_INCR |
| if (dl->target_fragment != NULL) { /* HACK to tell if targeted trace head */ |
| # ifdef CUSTOM_EXIT_STUBS |
| byte *pc = (byte *) (EXIT_FIXED_STUB_PC(dcontext, f, l)); |
| # else |
| byte *pc = (byte *) (EXIT_STUB_PC(dcontext, f, l)); |
| # endif |
| /* FIXME: more efficient way than multiple calls to get size-5? */ |
| ASSERT(linkstub_size(dcontext, f, l) == DIRECT_EXIT_STUB_SIZE(f->flags)); |
| patch_branch(pc + DIRECT_EXIT_STUB_SIZE(f->flags) - 5, |
| get_direct_exit_target(dcontext, f->flags), |
| HOT_PATCHABLE); |
| } |
| #endif |
| |
| /* change jmp target to point to top of exit stub */ |
| patch_branch(EXIT_CTI_PC(f, l), stub_pc, HOT_PATCHABLE); |
| } |
| |
| /* NOTE : for inlined indirect branches linking is !NOT! atomic with respect |
| * to a thread executing in the cache unless using the atomic_inlined_linking |
| * option (unlike unlinking) |
| */ |
| void |
| link_indirect_exit(dcontext_t *dcontext, fragment_t *f, linkstub_t *l, bool hot_patch) |
| { |
| app_pc target_tag = EXIT_TARGET_TAG(dcontext, f, l); |
| /* w/ indirect exits now having their stub pcs computed based |
| * on the cti targets, we must calculate them at a consistent |
| * state (we do have multi-stage modifications for inlined stubs) |
| */ |
| byte *stub_pc = (byte *) EXIT_STUB_PC(dcontext, f, l); |
| #ifdef CUSTOM_EXIT_STUBS |
| byte *fixed_stub_pc = (byte *) EXIT_FIXED_STUB_PC(dcontext, f, l); |
| #endif |
| |
| ASSERT(!TEST(FRAG_COARSE_GRAIN, f->flags)); |
| |
| ASSERT(linkstub_owned_by_fragment(dcontext, f, l)); |
| ASSERT(LINKSTUB_INDIRECT(l->flags)); |
| /* target is always the same, so if it's already linked, this is a nop */ |
| if ((l->flags & LINK_LINKED) != 0) { |
| STATS_INC(num_indirect_already_linked); |
| return; |
| } |
| STATS_INC(num_indirect_links); |
| |
| # ifdef WINDOWS |
| if (!is_shared_syscall_routine(dcontext, target_tag)) |
| # endif |
| { |
| ibl_code_t *ibl_code = |
| get_ibl_routine_code(dcontext, |
| extract_branchtype(l->flags), f->flags); |
| |
| if (ibl_code->ibl_head_is_inlined) { |
| /* need to make branch target the top of the exit stub */ |
| patch_branch(EXIT_CTI_PC(f, l), stub_pc, hot_patch); |
| if (DYNAMO_OPTION(atomic_inlined_linking)) { |
| return; |
| } |
| } |
| } |
| |
| link_indirect_exit_arch(dcontext, f, l, hot_patch, target_tag); |
| } |
| |
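| /* Returns the offset within l's exit stub of the unlinked entry point: |
| * non-zero only for indirect exits whose ibl stub head is inlined. |
| */ |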
| int |
| linkstub_unlink_entry_offset(dcontext_t *dcontext, fragment_t *f, linkstub_t *l) |
| { |
| ibl_code_t *ibl_code; |
| ASSERT(linkstub_owned_by_fragment(dcontext, f, l)); |
| if (!LINKSTUB_INDIRECT(l->flags)) |
| return 0; |
| #ifdef WINDOWS |
| if (is_shared_syscall_routine(dcontext, EXIT_TARGET_TAG(dcontext, f, l))) |
| return 0; |
| #endif |
| ibl_code = get_ibl_routine_code(dcontext, extract_branchtype(l->flags), f->flags); |
| if (ibl_code->ibl_head_is_inlined) |
| return ibl_code->inline_unlink_offs; |
| else |
| return 0; |
| } |
| |
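| /* Returns the routine this indirect exit targets: shared_syscall on Windows |
| * (if applicable), the unit's ibl prefix for coarse-grain fragments, or the |
| * ibl lookup routine otherwise. |
| */ |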
| cache_pc |
| indirect_linkstub_target(dcontext_t *dcontext, fragment_t *f, linkstub_t *l) |
| { |
| ASSERT(LINKSTUB_INDIRECT(l->flags)); |
| ASSERT(!TESTANY(LINK_NI_SYSCALL_ALL, l->flags)); |
| #ifdef WINDOWS |
| if (EXIT_TARGETS_SHARED_SYSCALL(l->flags)) { |
| /* currently this is the only way to distinguish shared_syscall |
| * exit from other indirect exits and from other exits in |
| * a fragment containing ignorable or non-ignorable syscalls |
| */ |
| ASSERT(TEST(FRAG_HAS_SYSCALL, f->flags)); |
| return shared_syscall_routine_ex(dcontext |
| _IF_X64(FRAGMENT_GENCODE_MODE(f->flags))); |
| } |
| #endif |
| if (TEST(FRAG_COARSE_GRAIN, f->flags)) { |
| /* Need to target the ibl prefix. Passing in the cti works as well as the stub, |
| * and avoids a circular dependence where linkstub_unlink_entry_offset() |
| * calls this routine to get the target and then this routine asks for |
| * the stub, which calls linkstub_unlink_entry_offset()... |
| */ |
| return get_coarse_ibl_prefix(dcontext, EXIT_CTI_PC(f, l), |
| extract_branchtype(l->flags)); |
| } else { |
| return get_ibl_routine_ex(dcontext, get_ibl_entry_type(l->flags), |
| get_source_fragment_type(dcontext, f->flags), |
| extract_branchtype(l->flags) |
| _IF_X64(FRAGMENT_GENCODE_MODE(f->flags))); |
| } |
| } |
| |
| /* based on machine state, returns which of cbr l1 and fall-through l2 |
| * must have been taken |
| */ |
| linkstub_t * |
| linkstub_cbr_disambiguate(dcontext_t *dcontext, fragment_t *f, |
| linkstub_t *l1, linkstub_t *l2) |
| { |
| instr_t instr; |
| linkstub_t *taken; |
| instr_init(dcontext, &instr); |
| decode(dcontext, EXIT_CTI_PC(f, l1), &instr); |
| ASSERT(instr_is_cbr(&instr)); |
| if (instr_cbr_taken(&instr, get_mcontext(dcontext), false/*post-state*/)) |
| taken = l1; |
| else |
| taken = l2; |
| instr_free(dcontext, &instr); |
| return taken; |
| } |
| |
| |
| /******************************************************************************* |
| * COARSE-GRAIN FRAGMENT SUPPORT |
| */ |
| |
| |
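| /* Returns whether the entrance stub at stub currently targets the |
| * trace-head-return prefix, i.e., whether its target is being treated as a |
| * trace head. |
| */ |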
| /* FIXME: case 10334: pass in info? */ |
| bool |
| coarse_is_trace_head(cache_pc stub) |
| { |
| if (coarse_is_entrance_stub(stub)) { |
| cache_pc tgt = entrance_stub_jmp_target(stub); |
| /* FIXME: could see if tgt is a jmp and deref and cmp to |
| * trace_head_return_coarse_routine() to avoid the vmvector |
| * lookup required to find the prefix |
| */ |
| return tgt == trace_head_return_coarse_prefix(stub, NULL); |
| } |
| return false; |
| } |
| |
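| /* Returns the target of the jmp at the end of the entrance stub at stub. */ |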
| cache_pc |
| entrance_stub_jmp_target(cache_pc stub) |
| { |
| cache_pc jmp = entrance_stub_jmp(stub); |
| cache_pc tgt; |
| ASSERT(jmp != NULL); |
| tgt = (cache_pc) PC_RELATIVE_TARGET(jmp+1); |
| #ifdef X86 |
| ASSERT(*jmp == JMP_OPCODE); |
| #elif defined(ARM) |
| /* FIXME i#1551: NYI on ARM */ |
| ASSERT_NOT_IMPLEMENTED(false); |
| #endif /* X86/ARM */ |
| return tgt; |
| } |
| |
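| /* Returns the application tag this entrance stub targets, extracted from the |
| * immediate that the stub stores into its tls slot (adjusted by mod_shift for |
| * persisted units: see i#670). |
| */ |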
| app_pc |
| entrance_stub_target_tag(cache_pc stub, coarse_info_t *info) |
| { |
| cache_pc jmp = entrance_stub_jmp(stub); |
| app_pc tag; |
| /* find the immed that is put into tls: at end of pre-jmp instr */ |
| #ifdef X64 |
| /* To identify whether 32-bit: we could look up the coarse_info_t |
| * this is part of but that's expensive so we check whether the |
| * tls offset has 2 high byte 0's (we always use addr16 for 32-bit). |
| * 32-bit: |
| * 67 64 c7 06 e0 0e 02 99 4e 7d addr16 mov $0x7d4e9902 -> %fs:0x0ee0 |
| * 64-bit is split into high and low dwords: |
| * 65 c7 04 25 20 16 00 00 02 99 4e 7d mov $0x7d4e9902 -> %gs:0x1620 |
| * 65 c7 04 25 24 16 00 00 00 00 00 00 mov $0x00000000 -> %gs:0x1624 |
| * both are followed by a direct jmp. |
| */ |
| if (*((ushort *)(jmp-6)) == 0) { /* 64-bit has 2 0's for high 2 bytes of tls offs */ |
| ptr_uint_t high32 = (ptr_uint_t) *((uint *)(jmp-4)); |
| ptr_uint_t low32 = (ptr_uint_t) |
| *((uint *)(jmp - (SIZE64_MOV_PTR_IMM_TO_TLS/2) - 4)); |
| tag = (cache_pc) ((high32 << 32) | low32); |
| } else { /* else fall-through to 32-bit case */ |
| #endif |
| tag = *((cache_pc *)(jmp-4)); |
| #ifdef X64 |
| } |
| #endif |
| /* if frozen, this could be a persist-time app pc (i#670). |
| * we take in info so we can know mod_shift (we can decode to find it |
| * for unlinked but not for linked) |
| */ |
| if (info == NULL) |
| info = get_stub_coarse_info(stub); |
| if (info->mod_shift != 0 && |
| tag >= info->persist_base && |
| tag < info->persist_base + (info->end_pc - info->base_pc)) |
| tag -= info->mod_shift; |
| return tag; |
| } |
| |
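| /* Returns whether pc points at the start of a coarse-grain indirect exit stub |
| * (identified by the initial xbx tls spill emitted by insert_jmp_to_ibl()). |
| */ |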
| bool |
| coarse_is_indirect_stub(cache_pc pc) |
| { |
| /* match insert_jmp_to_ibl */ |
| return instr_raw_is_tls_spill(pc, SCRATCH_REG1/*xbx/r1*/, INDIRECT_STUB_SPILL_SLOT); |
| } |
| |
| /* caller should call fragment_coarse_entry_pclookup() ahead of time |
| * to avoid deadlock if caller holds info->lock |
| */ |
| bool |
| coarse_cti_is_intra_fragment(dcontext_t *dcontext, coarse_info_t *info, |
| instr_t *inst, cache_pc start_pc) |
| { |
| /* We don't know the size of the fragment but we want to support |
| * intra-fragment ctis for clients (i#665) so we use some |
| * heuristics. A real cti is either linked to a target within the |
| * same coarse unit (where its target will be an entry point) or |
| * points at a stub of some kind (frozen exit prefix or separate |
| * entrance stub or inlined indirect stub). |
| */ |
| cache_pc tgt = opnd_get_pc(instr_get_target(inst)); |
| if (tgt < start_pc || |
| tgt >= start_pc + MAX_FRAGMENT_SIZE || |
| /* if tgt is an entry, then it's a linked exit cti |
| * XXX: this may acquire info->lock if it's never been called before |
| */ |
| fragment_coarse_entry_pclookup(dcontext, info, tgt) != NULL || |
| /* these lookups can get expensive, but we should only hit them |
| * when clients add intra-fragment ctis. |
| * XXX: is there a min distance we could use to rule out |
| * being in stubs? for frozen units, though, the prefixes are |
| * right after the cache. |
| */ |
| coarse_is_indirect_stub(tgt) || |
| in_coarse_stubs(tgt) || |
| in_coarse_stub_prefixes(tgt)) { |
| return false; |
| } else |
| return true; |
| } |
| |
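| /* Returns the final target of the coarse indirect stub at stub by following |
| * its jmp to the prefix and then the prefix's own jmp. |
| */ |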
| cache_pc |
| coarse_indirect_stub_jmp_target(cache_pc stub) |
| { |
| #ifdef X86 |
| cache_pc prefix_tgt, tgt; |
| cache_pc jmp; |
| size_t stub_size; |
| # ifdef X64 |
| /* See the stub sequences in entrance_stub_target_tag(): 32-bit always has |
| * an addr prefix while 64-bit does not |
| */ |
| /* FIXME: PR 209709: test perf and remove if outweighs space */ |
| if (*stub == ADDR_PREFIX_OPCODE) |
| stub_size = STUB_COARSE_INDIRECT_SIZE(FRAG_32_BIT); |
| else /* default */ |
| # endif |
| stub_size = STUB_COARSE_INDIRECT_SIZE(0); |
| jmp = stub + stub_size - JMP_LONG_LENGTH; |
| ASSERT(*jmp == JMP_OPCODE); |
| prefix_tgt = (cache_pc) PC_RELATIVE_TARGET(jmp+1); |
| ASSERT(*prefix_tgt == JMP_OPCODE); |
| tgt = (cache_pc) PC_RELATIVE_TARGET(prefix_tgt+1); |
| return tgt; |
| #elif defined(ARM) |
| /* FIXME i#1551: NYI on ARM */ |
| ASSERT_NOT_IMPLEMENTED(false); |
| return NULL; |
| #endif /* X86/ARM */ |
| } |
| |
| uint |
| coarse_indirect_stub_size(coarse_info_t *info) |
| { |
| /* Keep in synch w/ exit_stub_size(). We export this separately since |
| * it's difficult to get the target to pass to exit_stub_size(). |
| */ |
| return STUB_COARSE_INDIRECT_SIZE(COARSE_32_FLAG(info)); |
| } |
| |
| /* Passing in stub's info avoids a vmvector lookup */ |
| bool |
| entrance_stub_linked(cache_pc stub, coarse_info_t *info /*OPTIONAL*/) |
| { |
| /* entrance stubs are of two types: |
| * - targeting trace heads: always point to trace_head_return_coarse, |
| * whether target exists or not, so are always unlinked; |
| * - targeting non-trace-heads: if linked, point to fragment; if unlinked, |
| * point to fcache_return_coarse |
| */ |
| cache_pc tgt = entrance_stub_jmp_target(stub); |
| /* FIXME: do vmvector just once instead of for each call */ |
| return (tgt != trace_head_return_coarse_prefix(stub, info) && |
| tgt != fcache_return_coarse_prefix(stub, info)); |
| } |
| |
| /* Returns whether it had to change page protections */ |
| static bool |
| patch_coarse_branch(cache_pc stub, cache_pc tgt, bool hot_patch, |
| coarse_info_t *info /*OPTIONAL*/) |
| { |
| bool stubs_readonly = false; |
| bool stubs_restore = false; |
| if (DYNAMO_OPTION(persist_protect_stubs)) { |
| if (info == NULL) |
| info = get_stub_coarse_info(stub); |
| ASSERT(info != NULL); |
| if (info->stubs_readonly) { |
| stubs_readonly = true; |
| stubs_restore = true; |
| /* if we don't preserve mapped-in COW state the protection change |
| * will fail (case 10570) |
| */ |
| make_copy_on_writable((byte *)PAGE_START(entrance_stub_jmp(stub)), |
| /* stub jmp can't cross page boundary (can't |
| * cross cache line in fact) */ |
| PAGE_SIZE); |
| if (DYNAMO_OPTION(persist_protect_stubs_limit) > 0) { |
| info->stubs_write_count++; |
| if (info->stubs_write_count > |
| DYNAMO_OPTION(persist_protect_stubs_limit)) { |
| SYSLOG_INTERNAL_WARNING_ONCE("pcache stubs over write limit"); |
| STATS_INC(pcache_unprot_over_limit); |
| stubs_restore = false; |
| info->stubs_readonly = false; |
| } |
| } |
| } |
| } |
| patch_branch(entrance_stub_jmp(stub), tgt, HOT_PATCHABLE); |
| if (stubs_restore) |
| make_unwritable((byte *)PAGE_START(entrance_stub_jmp(stub)), PAGE_SIZE); |
| return stubs_readonly; |
| } |
| |
| /* Passing in stub's info avoids a vmvector lookup */ |
| void |
| link_entrance_stub(dcontext_t *dcontext, cache_pc stub, cache_pc tgt, |
| bool hot_patch, coarse_info_t *info /*OPTIONAL*/) |
| { |
| ASSERT(DYNAMO_OPTION(coarse_units)); |
| ASSERT(self_owns_recursive_lock(&change_linking_lock)); |
| LOG(THREAD, LOG_LINKS, 5, "link_entrance_stub "PFX"\n", stub); |
| if (patch_coarse_branch(stub, tgt, hot_patch, info)) |
| STATS_INC(pcache_unprot_link); |
| /* We check this afterward since this link may be what makes it consistent. |
| * FIXME: pass in an arg to not check the target? Then call before and after. */ |
| ASSERT(coarse_is_entrance_stub(stub)); |
| } |
| |
| /* Passing in stub's info avoids a vmvector lookup */ |
| void |
| unlink_entrance_stub(dcontext_t *dcontext, cache_pc stub, uint flags, |
| coarse_info_t *info /*OPTIONAL*/) |
| { |
| cache_pc tgt; |
| ASSERT(DYNAMO_OPTION(coarse_units)); |
| ASSERT(coarse_is_entrance_stub(stub)); |
| ASSERT(self_owns_recursive_lock(&change_linking_lock)); |
| LOG(THREAD, LOG_LINKS, 5, |
| "unlink_entrance_stub "PFX"\n", stub); |
| if (TESTANY(FRAG_IS_TRACE_HEAD|FRAG_IS_TRACE, flags)) |
| tgt = trace_head_return_coarse_prefix(stub, info); |
| else |
| tgt = fcache_return_coarse_prefix(stub, info); |
| if (patch_coarse_branch(stub, tgt, HOT_PATCHABLE, info)) |
| STATS_INC(pcache_unprot_unlink); |
| } |
| |
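| /* Returns the current target of the exit cti at cti, which is expected to be |
| * its entrance stub. |
| */ |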
| cache_pc |
| entrance_stub_from_cti(cache_pc cti) |
| { |
| cache_pc disp = exit_cti_disp_pc(cti); |
| cache_pc tgt = (cache_pc) PC_RELATIVE_TARGET(disp); |
| return tgt; |
| } |
| |
| /*******************************************************************************/ |
| |
| /* Patch list support routines */ |
| void |
| init_patch_list(patch_list_t *patch, patch_list_type_t type) |
| { |
| patch->num_relocations = 0; |
| /* Cast to int to avoid a tautological comparison warning from clang. */ |
| ASSERT_TRUNCATE(patch->type, ushort, (int)type); |
| patch->type = (ushort) type; |
| } |
| |
| /* Adds an instruction to the patch list, along with the address of the |
| * location to update later. Use the type-checked wrappers add_patch_entry |
| * or add_patch_marker instead of calling this directly. |
| */ |
| void |
| add_patch_entry_internal(patch_list_t *patch, instr_t *instr, ushort patch_flags, |
| short instruction_offset, |
| ptr_uint_t value_location_offset) |
| { |
| uint i = patch->num_relocations; |
| |
| ASSERT(patch->num_relocations < MAX_PATCH_ENTRIES); |
| /* Since in debug build we have the extra slots for stats, it's important |
| * to provide a useful release build message |
| */ |
| if (patch->num_relocations >= MAX_PATCH_ENTRIES) { |
| SYSLOG_CUSTOM_NOTIFY(SYSLOG_CRITICAL, MSG_EXCEPTION, 4, |
| "Maximum patch entries exceeded", |
| get_application_name(), get_application_pid(), |
| "<maxpatch>", "Maximum patch entries exceeded"); |
| os_terminate(get_thread_private_dcontext(), TERMINATE_PROCESS); |
| ASSERT_NOT_REACHED(); |
| } |
| |
| LOG(THREAD_GET, LOG_EMIT, 4, |
| "add_patch_entry[%d] value_location_offset="PFX"\n", i, |
| value_location_offset); |
| |
| patch->entry[i].where.instr = instr; |
| patch->entry[i].patch_flags = patch_flags; |
| patch->entry[i].value_location_offset = value_location_offset; |
| patch->entry[i].instr_offset = instruction_offset; |
| |
| patch->num_relocations++; |
| } |
| |
| |
| /* Adds an instruction to the patch list so its offset can be retrieved later. |
| Takes an instruction and an offset within the instruction. |
| Result: the offset within the encoded instruction stream will |
| be stored in target_offset by encode_with_patch_list. |
| */ |
| void |
| add_patch_marker(patch_list_t *patch, instr_t *instr, ushort patch_flags, |
| short instr_offset, ptr_uint_t *target_offset /* OUT */) |
| { |
| add_patch_entry_internal(patch, instr, (ushort) (patch_flags | PATCH_MARKER), |
| instr_offset, (ptr_uint_t) target_offset); |
| } |
| |
| /* remove PATCH_MARKER entries since not needed for dynamic updates */ |
| static INLINE_ONCE void |
| remove_assembled_patch_markers(dcontext_t *dcontext, patch_list_t *patch) |
| { |
| ushort i=0, j=0; |
| |
| /* we can remove the PATCH_MARKER entries after encoding, |
| and so patch_emitted_code won't even need to check for PATCH_MARKER |
| */ |
| |
| while (j < patch->num_relocations) { |
| if (TEST(PATCH_MARKER, patch->entry[j].patch_flags)) { |
| LOG(THREAD, LOG_EMIT, 4, |
| "remove_assembled_patch_markers: removing marker %d\n", j); |
| } else { |
| patch->entry[i] = patch->entry[j]; |
| i++; |
| } |
| |
| j++; |
| } |
| |
| LOG(THREAD, LOG_EMIT, 3, "remove_assembled_patch_markers: relocations %d, left only %d\n", |
| patch->num_relocations, i); |
| patch->num_relocations = i; |
| } |
| |
| |
| /* Indirect all instructions instead of later patching */ |
| static void |
| relocate_patch_list(dcontext_t *dcontext, patch_list_t *patch, |
| instrlist_t *ilist) |
| { |
| instr_t *inst; |
| uint cur = 0; |
| LOG(THREAD, LOG_EMIT, 3, "relocate_patch_list ["PFX"]\n", patch); |
| |
| /* go through the instructions and "relocate" by indirectly using XDI */ |
| for (inst = instrlist_first(ilist); inst; inst = instr_get_next(inst)) { |
| if (cur < patch->num_relocations && |
| inst == patch->entry[cur].where.instr) { |
| ASSERT(!TEST(PATCH_OFFSET_VALID, patch->entry[cur].patch_flags)); |
| |
| if (!TEST(PATCH_MARKER, patch->entry[cur].patch_flags)) { |
| opnd_t opnd; |
| ASSERT(instr_num_srcs(inst) > 0); |
| opnd = instr_get_src(inst, 0); |
| |
| DOLOG(4, LOG_EMIT, { |
| LOG(THREAD, LOG_EMIT, 2, |
| "relocate_patch_list: patch_entry_t[%d] before update\n", cur); |
| instr_disassemble(dcontext, inst, THREAD); |
| LOG(THREAD, LOG_EMIT, 2, "\n"); |
| }); |
| /* we assume that per_thread_t will be in XDI, |
| and the displacement is in value_location_offset */ |
| IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_int |
| (patch->entry[cur].value_location_offset))); |
| if (opnd_is_near_base_disp(opnd)) { |
| /* indirect through XDI and update displacement */ |
| opnd_set_disp(&opnd, (int) patch->entry[cur].value_location_offset); |
| opnd_replace_reg(&opnd, REG_NULL, SCRATCH_REG5/*xdi/r5*/); |
| } else if (opnd_is_immed_int(opnd)) { |
| /* indirect through XDI and set displacement */ |
| /* converting AND $0x00003fff, %xcx -> %xcx |
| into AND mask(%xdi), %xcx -> %xcx |
| */ |
| opnd = opnd_create_base_disp |
| (SCRATCH_REG5/*xdi/r5*/, REG_NULL, 0, |
| (int) patch->entry[cur].value_location_offset, OPSZ_4); |
| } |
| |
| instr_set_src(inst, 0, opnd); |
| DOLOG(3, LOG_EMIT, { |
| LOG(THREAD, LOG_EMIT, 2, |
| "relocate_patch_list: patch_entry_t[%d] after update\n", cur); |
| instr_disassemble(dcontext, inst, THREAD); |
| LOG(THREAD, LOG_EMIT, 2, "\n"); |
| }); |
| } |
| cur++; |
| } |
| } |
| } |
| |
| /* Updates patch list with offsets in assembled instruction list */ |
| /* Cf: instrlist_encode which does not support a patch list */ |
| /* Returns length of emitted code */ |
| int |
| encode_with_patch_list(dcontext_t *dcontext, patch_list_t *patch, |
| instrlist_t *ilist, cache_pc start_pc) |
| { |
| instr_t *inst; |
| uint len; |
| uint cur; |
| cache_pc pc = start_pc; |
| |
| ASSERT(patch->num_relocations < MAX_PATCH_ENTRIES); |
| |
| if (patch->type == PATCH_TYPE_INDIRECT_XDI) { |
| relocate_patch_list(dcontext, patch, ilist); |
| } |
| |
| /* now encode the instructions */ |
| /* must set note fields first with offset */ |
| len = 0; |
| for (inst = instrlist_first(ilist); inst; inst = instr_get_next(inst)) { |
| instr_set_note(inst, (void *)(ptr_uint_t)len); |
| len += instr_length(dcontext, inst); |
| } |
| |
| cur = 0; |
| /* after instruction list is assembled we collect the offsets */ |
| for (inst = instrlist_first(ilist); inst; inst = instr_get_next(inst)) { |
| short offset_in_instr = patch->entry[cur].instr_offset; |
| byte *nxt_pc = instr_encode(dcontext, inst, pc); |
| ASSERT(nxt_pc != NULL); |
| len = (int) (nxt_pc - pc); |
| pc = nxt_pc; |
| |
| if (cur < patch->num_relocations && |
| inst == patch->entry[cur].where.instr) { |
| ASSERT(!TEST(PATCH_OFFSET_VALID, patch->entry[cur].patch_flags)); |
| |
| /* support positive offsets from the beginning of the instruction and negative offsets from its end */ |
| if (offset_in_instr < 0) { |
| /* grab offset offset_in_instr bytes from the end of instruction */ |
| /* most commonly -4 for a 32bit immediate */ |
| patch->entry[cur].where.offset = |
| ((pc + offset_in_instr) - start_pc); |
| } else { |
| /* grab offset after skipping offset_in_instr from beginning of instruction */ |
| patch->entry[cur].where.offset = |
| ((pc - len + offset_in_instr) - start_pc); |
| } |
| patch->entry[cur].patch_flags |= PATCH_OFFSET_VALID; |
| |
| LOG(THREAD, LOG_EMIT, 4, |
| "encode_with_patch_list: patch_entry_t[%d] offset="PFX"\n", |
| cur, patch->entry[cur].where.offset); |
| |
| if (TEST(PATCH_MARKER, patch->entry[cur].patch_flags)) { |
| /* treat value_location_offset as an output argument |
| and store the computed offset there |
| */ |
| ptr_uint_t *output_value = (ptr_uint_t *) |
| patch->entry[cur].value_location_offset; |
| ptr_uint_t output_offset = patch->entry[cur].where.offset; |
| if (TEST(PATCH_ASSEMBLE_ABSOLUTE, patch->entry[cur].patch_flags)) { |
| ASSERT(!TEST(PATCH_UINT_SIZED, patch->entry[cur].patch_flags)); |
| output_offset += (ptr_uint_t)start_pc; |
| } |
| if (TEST(PATCH_UINT_SIZED, patch->entry[cur].patch_flags)) { |
| IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_uint(output_offset))); |
| *((uint *)output_value) = (uint) output_offset; |
| } else |
| *output_value = output_offset; |
| } |
| |
| LOG(THREAD, LOG_EMIT, 4, |
| "encode_with_patch_list [%d] extras patch_flags=0x%x value_offset=" |
| PFX"\n", cur, patch->entry[cur].patch_flags, |
| patch->entry[cur].value_location_offset); |
| cur++; |
| } |
| } |
| |
| /* assuming patchlist is in the same order as ilist, we should have seen all */ |
| LOG(THREAD, LOG_EMIT, 4, "cur %d, num %d\n", cur, patch->num_relocations); |
| ASSERT(cur == patch->num_relocations); |
| |
| remove_assembled_patch_markers(dcontext, patch); |
| ASSERT(CHECK_TRUNCATE_TYPE_int(pc - start_pc)); |
| return (int)(pc - start_pc); |
| } |
| |
| #ifdef DEBUG |
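| /* Logs every entry of a patch list whose offsets have already been computed. */ |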
| void |
| print_patch_list(patch_list_t *patch) |
| { |
| uint i; |
| LOG(THREAD_GET, LOG_EMIT, 4, "patch="PFX" num_relocations=%d\n", |
| patch, patch->num_relocations); |
| |
| for(i=0; i<patch->num_relocations; i++) { |
| ASSERT(TEST(PATCH_OFFSET_VALID, patch->entry[i].patch_flags)); |
| LOG(THREAD_GET, LOG_EMIT, 4, |
| "patch_list [%d] offset="PFX" patch_flags=%d value_offset="PFX"\n", i, |
| patch->entry[i].where.offset, |
| patch->entry[i].patch_flags, |
| patch->entry[i].value_location_offset); |
| } |
| } |
| |
| # ifdef INTERNAL |
| /* disassembles code adding patch list labels */ |
| static void |
| disassemble_with_annotations(dcontext_t *dcontext, patch_list_t *patch, |
| byte *start_pc, byte *end_pc) |
| { |
| byte *pc = start_pc; |
| uint cur = 0; |
| |
| do { |
| if (cur < patch->num_relocations && |
| pc >= start_pc + patch->entry[cur].where.offset) { |
| ASSERT(TEST(PATCH_OFFSET_VALID, patch->entry[cur].patch_flags)); |
| /* this is slightly off - we'll mark the next instruction, |
| but it is good enough for this purpose */ |
| LOG(THREAD, LOG_EMIT, 2, "%d:", cur); |
| cur++; |
| } else { |
| LOG(THREAD, LOG_EMIT, 2, " "); |
| } |
| |
| pc = disassemble_with_bytes(dcontext, pc, THREAD); |
| } while (pc < end_pc); |
| LOG(THREAD, LOG_EMIT, 2, "\n"); |
| } |
| # endif |
| #endif |
| |
| /* updates emitted code according to patch list */ |
| static void |
| patch_emitted_code(dcontext_t *dcontext, patch_list_t *patch, byte *start_pc) |
| { |
| uint i; |
| /* FIXME: can get this as a patch list entry through indirection */ |
| per_thread_t *pt = (per_thread_t *) dcontext->fragment_field; |
| ASSERT(dcontext != GLOBAL_DCONTEXT && dcontext != NULL); |
| |
| LOG(THREAD, LOG_EMIT, 2, "patch_emitted_code start_pc="PFX" pt="PFX"\n", |
| start_pc, pt); |
| if (patch->type != PATCH_TYPE_ABSOLUTE) { |
| LOG(THREAD, LOG_EMIT, 2, |
| "patch_emitted_code type=%d indirected, nothing to patch\n", patch->type); |
| /* FIXME: propagate the check earlier to save the extraneous calls |
| to update_indirect_exit_stub and update_indirect_branch_lookup |
| */ |
| return; |
| } |
| DOLOG(4, LOG_EMIT, { |
| print_patch_list(patch); |
| }); |
| for(i=0; i<patch->num_relocations; i++) { |
| byte *pc = start_pc + patch->entry[i].where.offset; |
| /* value address, (think for example of pt->trace.hash_mask) */ |
| ptr_uint_t value; |
| char *vaddr = NULL; |
| if (TEST(PATCH_PER_THREAD, patch->entry[i].patch_flags)) { |
| vaddr = (char *)pt + patch->entry[i].value_location_offset; |
| } else if (TEST(PATCH_UNPROT_STAT, patch->entry[i].patch_flags)) { |
| /* separate the two parts of the stat */ |
| uint unprot_offs = (uint) (patch->entry[i].value_location_offset) >> 16; |
| uint field_offs = (uint) (patch->entry[i].value_location_offset) & 0xffff; |
| IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_uint |
| (patch->entry[i].value_location_offset))); |
| vaddr = (*((char **)((char *)pt + unprot_offs))) + field_offs; |
| LOG(THREAD, LOG_EMIT, 4, |
| "patch_emitted_code [%d] value "PFX" => 0x%x 0x%x => "PFX"\n", |
| i, patch->entry[i].value_location_offset, unprot_offs, field_offs, vaddr); |
| } |
| else |
| ASSERT_NOT_REACHED(); |
| ASSERT(TEST(PATCH_OFFSET_VALID, patch->entry[i].patch_flags)); |
| ASSERT(!TEST(PATCH_MARKER, patch->entry[i].patch_flags)); |
| |
| if (!TEST(PATCH_TAKE_ADDRESS, patch->entry[i].patch_flags)) { |
| /* use value pointed by computed address */ |
| if (TEST(PATCH_UINT_SIZED, patch->entry[i].patch_flags)) |
| value = (ptr_uint_t) *((uint *)vaddr); |
| else |
| value = *(ptr_uint_t*)vaddr; |
| } else { |
| ASSERT(!TEST(PATCH_UINT_SIZED, patch->entry[i].patch_flags)); |
| value = (ptr_uint_t)vaddr; /* use computed address */ |
| } |
| |
| LOG(THREAD, LOG_EMIT, 4, |
| "patch_emitted_code [%d] offset="PFX" patch_flags=%d value_offset="PFX |
| " vaddr="PFX" value="PFX"\n", i, |
| patch->entry[i].where.offset, patch->entry[i].patch_flags, |
| patch->entry[i].value_location_offset, vaddr, value); |
| if (TEST(PATCH_UINT_SIZED, patch->entry[i].patch_flags)) { |
| IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_uint(value))); |
| *((uint*)pc) = (uint) value; |
| } else |
| *((ptr_uint_t *)pc) = value; |
| LOG(THREAD, LOG_EMIT, 4, |
| "patch_emitted_code: updated pc *"PFX" = "PFX"\n", pc, value); |
| } |
| |
| STATS_INC(emit_patched_fragments); |
| DOSTATS({ |
| /* PR 217008: avoid gcc warning from truncation assert in XSTATS_ADD_DC */ |
| int tmp_num = patch->num_relocations; |
| STATS_ADD(emit_patched_relocations, tmp_num); |
| }); |
| LOG(THREAD, LOG_EMIT, 4, "patch_emitted_code done\n"); |
| } |
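| |
| /* Illustration only (not compiled): how the loop above resolves a single |
|  * PATCH_PER_THREAD, PATCH_UINT_SIZED entry (entry = &patch->entry[i]), using |
|  * the pt->trace.hash_mask example mentioned in the loop's comment: |
|  * |
|  *   byte *pc    = start_pc + entry->where.offset; |
|  *   uint *vaddr = (uint *)((char *)pt + entry->value_location_offset); |
|  *   *(uint *)pc = *vaddr;    <- splice the current mask into the emitted code |
|  * |
|  * A PATCH_TAKE_ADDRESS entry instead writes the computed vaddr itself. |
|  */ |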
| |
| |
| /* Updates an indirect branch exit stub with the latest hashtable mask |
|  * and hashtable address. |
|  * See also update_indirect_branch_lookup(). |
|  */ |
| void |
| update_indirect_exit_stub(dcontext_t *dcontext, fragment_t *f, linkstub_t *l) |
| { |
| generated_code_t *code = get_emitted_routines_code |
| (dcontext _IF_X64(FRAGMENT_GENCODE_MODE(f->flags))); |
| # ifdef CUSTOM_EXIT_STUBS |
| byte *start_pc = (byte *) EXIT_FIXED_STUB_PC(dcontext, f, l); |
| # else |
| byte *start_pc = (byte *) EXIT_STUB_PC(dcontext, f, l); |
| # endif |
| ibl_branch_type_t branch_type; |
| |
| ASSERT(linkstub_owned_by_fragment(dcontext, f, l)); |
| ASSERT(LINKSTUB_INDIRECT(l->flags)); |
| ASSERT(EXIT_HAS_STUB(l->flags, f->flags)); |
|     /* Shared fragments use indirection so no patching is needed; caller should check */ |
| ASSERT(!TEST(FRAG_SHARED, f->flags)); |
| #ifdef WINDOWS |
| /* Do not touch shared_syscall */ |
| if (EXIT_TARGET_TAG(dcontext, f, l) == |
| shared_syscall_routine_ex(dcontext _IF_X64(FRAGMENT_GENCODE_MODE(f->flags)))) |
| return; |
| #endif |
| branch_type = extract_branchtype(l->flags); |
| |
| LOG(THREAD, LOG_EMIT, 4, "update_indirect_exit_stub: f->tag="PFX"\n", |
| f->tag); |
| |
| if (DYNAMO_OPTION(disable_traces) && !code->bb_ibl[branch_type].ibl_head_is_inlined) { |
| return; |
| } |
| |
| if (TEST(FRAG_IS_TRACE, f->flags)) { |
| ASSERT(code->trace_ibl[branch_type].ibl_head_is_inlined); |
| patch_emitted_code(dcontext, &code->trace_ibl[branch_type].ibl_stub_patch, start_pc); |
| } else { |
| ASSERT(code->bb_ibl[branch_type].ibl_head_is_inlined); |
| patch_emitted_code(dcontext, &code->bb_ibl[branch_type].ibl_stub_patch, start_pc); |
| } |
| } |
| |
| /*########################################################################### |
| * |
| * fragment_t Prefixes |
| * |
| * Two types: indirect branch target, which restores eflags and xcx, and |
| * normal prefix, which just restores xcx |
| */ |
| |
| /* Indirect Branch Target Prefix |
| * We have 3 different prefixes: one if we don't need to restore eflags, one |
| * if we need to restore just using sahf, and one if we also need to restore |
| * the overflow flag OF. |
| * |
| * FIXME: currently we cache-align the prefix, not the normal |
| * entry point...if prefix gets much longer, might want to add |
| * nops to get normal entry cache-aligned? |
| */ |
| |
| /* for now all ibl targets must use same scratch locations: tls or not, no mixture */ |
| |
| #define RESTORE_XAX_PREFIX(flags) \ |
| ((FRAG_IS_X86_TO_X64(flags) && \ |
| IF_X64_ELSE(DYNAMO_OPTION(x86_to_x64_ibl_opt), false)) ? \ |
| SIZE64_MOV_R8_TO_XAX : \ |
| (IBL_EFLAGS_IN_TLS() ? SIZE_MOV_XAX_TO_TLS(flags, false) : SIZE32_MOV_XAX_TO_ABS)) |
| #define PREFIX_BASE(flags) \ |
| (RESTORE_XAX_PREFIX(flags) + FRAGMENT_BASE_PREFIX_SIZE(flags)) |
| |
| |
| int |
| fragment_prefix_size(uint flags) |
| { |
| if (use_ibt_prefix(flags)) { |
| bool use_eflags_restore = TEST(FRAG_IS_TRACE, flags) ? |
| !DYNAMO_OPTION(trace_single_restore_prefix) : |
| !DYNAMO_OPTION(bb_single_restore_prefix); |
| /* The common case is !INTERNAL_OPTION(unsafe_ignore_eflags*) so |
| * PREFIX_BASE(flags) is defined accordingly, and we subtract from it to |
| * get the correct value when the option is on. |
| */ |
| if (INTERNAL_OPTION(unsafe_ignore_eflags_prefix)) { |
| if (INTERNAL_OPTION(unsafe_ignore_eflags_ibl)) { |
| ASSERT(PREFIX_BASE(flags) - RESTORE_XAX_PREFIX(flags) >= 0); |
| return PREFIX_BASE(flags) - RESTORE_XAX_PREFIX(flags); |
| } else { |
| /* still need to restore xax, just don't restore eflags */ |
| return PREFIX_BASE(flags); |
| } |
| } |
| if (!use_eflags_restore) |
| return PREFIX_BASE(flags) - RESTORE_XAX_PREFIX(flags); |
| if (TEST(FRAG_WRITES_EFLAGS_6, flags)) /* no flag restoration needed */ |
| return PREFIX_BASE(flags); |
| else if (TEST(FRAG_WRITES_EFLAGS_OF, flags)) /* no OF restoration needed */ |
| return (PREFIX_BASE(flags) + PREFIX_SIZE_FIVE_EFLAGS); |
| else /* must restore all 6 flags */ |
| if (INTERNAL_OPTION(unsafe_ignore_overflow)) { |
| /* do not restore OF */ |
| return (PREFIX_BASE(flags) + PREFIX_SIZE_FIVE_EFLAGS); |
| } else { |
| return (PREFIX_BASE(flags) + PREFIX_SIZE_RESTORE_OF + |
| PREFIX_SIZE_FIVE_EFLAGS); |
| } |
| } else { |
| #ifdef CLIENT_INTERFACE |
| if (dynamo_options.bb_prefixes) |
| return FRAGMENT_BASE_PREFIX_SIZE(flags); |
| else |
| #endif |
| return 0; |
| } |
| } |
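| |
| /* Worked example (a sketch in terms of the size macros above, not literal |
|  * byte counts): an ibt-prefixed trace with FRAG_WRITES_EFLAGS_OF set and no |
|  * unsafe_* options gets PREFIX_BASE(flags) + PREFIX_SIZE_FIVE_EFLAGS, i.e. |
|  * the xax restore plus the base prefix plus the sahf-based 5-flag restore; |
|  * a fragment that does not write OF additionally needs |
|  * PREFIX_SIZE_RESTORE_OF, unless -unsafe_ignore_overflow is set. |
|  */ |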
| |
| #ifdef PROFILE_RDTSC |
| /*************************************************************************** |
| *************************************************************************** |
| ** PROFILING USING RDTSC |
| ** |
| **/ |
| /* |
| We want the profile code to not count towards fragment times. |
| So we stop time as quickly as possible, in assembly here instead of |
| in the profile_fragment_enter function, and start time again as late |
| as possible: |
| mov %eax, eax_offset(dcontext) # save eax |
| mov %edx, edx_offset(dcontext) # save edx |
| rdtsc # stop time |
| switch to dynamo stack |
| pushfl # save eflags (call will clobber) |
| mov %ecx, ecx_offset(dcontext) # save ecx |
| pushl %edx # pass time as arg |
| pushl %eax |
| pushil &fragment_address # pass &frag as arg |
| call profile_fragment_enter # |
| addl $0xc, %esp # clean up args |
| popl %ecx # restore ecx |
| popfl # restore eflags |
| restore app stack |
| rdtsc # start time |
| movl %eax, start_time_OFFS(dcontext) # store time value |
| movl %edx, 4+start_time_OFFS(dcontext) # store time value |
| mov eax_offset(dcontext), %eax # restore eax |
| mov edx_offset(dcontext), %edx # restore edx |
| mov ecx_offset(dcontext), %ecx # restore ecx |
| */ |
| |
| static uint profile_call_length = 0; |
| static int profile_call_fragment_offset = 0; |
| static int profile_call_call_offset = 0; |
| static byte profile_call_buf[128]; |
| static dcontext_t *buffer_dcontext; |
| static void build_profile_call_buffer(void); |
| |
| uint |
| profile_call_size() |
| { |
| if (profile_call_length == 0) |
| build_profile_call_buffer(); |
| return profile_call_length; |
| } |
| |
| /* if insert_profile_call emits its code into the trace buffer, this |
| * routine must be called once the fragment is created and the code is |
| * in the fcache |
| */ |
| void |
| finalize_profile_call(dcontext_t *dcontext, fragment_t *f) |
| { |
| byte *start_pc = (byte *) FCACHE_ENTRY_PC(f); |
| byte *pc; |
| byte *prev_pc; |
| instr_t instr; |
| instr_init(dcontext, &instr); |
| |
| /* fill in address of owning fragment now that that fragment exists */ |
| pc = start_pc + profile_call_fragment_offset; |
| /* PR 248210: unsupported feature on x64 */ |
| IF_X64(ASSERT_NOT_IMPLEMENTED(false)); |
| *((int *)pc) = (uint)f; |
| |
| /* fill in call's proper pc-relative offset now that code is |
| * in its final location in fcache |
| */ |
| pc = start_pc + profile_call_call_offset; |
| IF_X64(ASSERT_NOT_IMPLEMENTED(false)); |
| *((int *)pc) = (int)&profile_fragment_enter - (int)pc - 4; |
| |
| /* must fix up all dcontext references to point to the right dcontext */ |
| pc = start_pc; |
| do { |
| prev_pc = pc; |
| instr_reset(dcontext, &instr); |
| pc = decode(dcontext, pc, &instr); |
| ASSERT(instr_valid(&instr)); /* our own code! */ |
| /* look for loads and stores that reference buffer_dcontext */ |
| if (instr_get_opcode(&instr) == OP_mov_ld && |
| opnd_is_near_base_disp(instr_get_src(&instr, 0)) && |
| opnd_get_base(instr_get_src(&instr, 0)) == REG_NULL && |
| opnd_get_index(instr_get_src(&instr, 0)) == REG_NULL) { |
|             /* if not really a dcontext value, update_dcontext_address will |
|              * return the old value |
|              */ |
| instr_set_src(&instr, 0, |
| update_dcontext_address(instr_get_src(&instr, 0), |
| buffer_dcontext, dcontext)); |
| } |
| else if (instr_get_opcode(&instr) == OP_mov_st && |
| opnd_is_near_base_disp(instr_get_dst(&instr, 0)) && |
| opnd_get_base(instr_get_dst(&instr, 0)) == REG_NULL && |
| opnd_get_index(instr_get_dst(&instr, 0)) == REG_NULL) { |
|             /* if not really a dcontext value, update_dcontext_address will |
|              * return the old value |
|              */ |
| instr_set_dst(&instr, 0, |
| update_dcontext_address(instr_get_dst(&instr, 0), |
| buffer_dcontext, dcontext)); |
| } |
| if (!instr_raw_bits_valid(&instr)) { |
| DEBUG_DECLARE(byte *nxt_pc;) |
| DEBUG_DECLARE(nxt_pc = ) instr_encode(dcontext, &instr, prev_pc); |
| ASSERT(nxt_pc != NULL); |
| } |
| } while (pc < start_pc + profile_call_length); |
| instr_free(dcontext, &instr); |
| } |
| |
| |
| void |
| insert_profile_call(cache_pc start_pc) |
| { |
| if (profile_call_length == 0) |
| build_profile_call_buffer(); |
| memcpy((void *)start_pc, profile_call_buf, profile_call_length); |
| /* if thread-private, we change to proper dcontext when finalizing */ |
| } |
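| |
| /* Typical lifecycle (a sketch based on the routines in this section): |
|  *   uint sz = profile_call_size();        <- reserve sz bytes in the trace |
|  *   insert_profile_call(start_pc);        <- memcpy the pre-built template |
|  *   ... fragment is emitted into the fcache ... |
|  *   finalize_profile_call(dcontext, f);   <- patch the fragment address, the |
|  *                                            call displacement, and the |
|  *                                            dcontext references |
|  */ |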
| |
| |
| /* This routine builds the profile call code using the instr_t |
| * abstraction, then emits it into a buffer to be saved. |
| * The code can then be directly copied whenever needed. |
| * Assumption: this thread's dcontext must have been created |
| * before calling this function. |
| */ |
| static void |
| build_profile_call_buffer() |
| { |
| byte *pc, *nxt_pc; |
| instrlist_t ilist; |
| instr_t *inst; |
| int start_time_offs; |
| dcontext_t *dcontext = get_thread_private_dcontext(); |
| ASSERT(dcontext != NULL); |
| /* remember dcontext for easy replacement when finalizing: */ |
| buffer_dcontext = dcontext; |
| |
| /* we require a dcontext to find this offset because it may |
| * or may not be pushed to a quadword boundary, making it |
| * hard to hardcode it |
| */ |
| start_time_offs = (int)(&(dcontext->start_time)) - (int)dcontext; |
| |
| /* initialize the ilist */ |
| instrlist_init(&ilist); |
| |
| APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_EAX, SCRATCH_REG0_OFFS)); |
| APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_EDX, SCRATCH_REG3_OFFS)); |
| |
| /* get time = rdtsc */ |
| APP(&ilist, INSTR_CREATE_rdtsc(dcontext)); |
| |
| /* swap to dstack */ |
| APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_ESP, XSP_OFFSET)); |
| APP(&ilist, instr_create_restore_dynamo_stack(dcontext)); |
| |
| /* finish saving caller-saved registers |
| * The profile_fragment_enter function will save the callee-saved |
| * regs (ebx, ebp, esi, edi) and will restore ebp and esp, but we need |
| * to explicitly save eax, ecx, and edx |
| */ |
| APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_ECX, SCRATCH_REG2_OFFS)); |
| |
| /* save eflags (call will clobber) */ |
| APP(&ilist, INSTR_CREATE_RAW_pushf(dcontext)); |
| |
| #ifdef WINDOWS |
| /* must preserve the LastErrorCode (if the profile procedure |
| * calls a Win32 API routine it could overwrite the app's error code) |
| * currently this is done in the profile routine itself -- |
| * if you want to move it here, look at the code in profile.c |
| */ |
| #endif |
| |
| /* push time as 2nd argument for call */ |
| APP(&ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_EDX))); |
| APP(&ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_EAX))); |
| |
| /* push fragment address as 1st argument for call |
| * fragment isn't built yet, we fill it in in finalize_profile_call |
| */ |
| APP(&ilist, INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INT32(0))); |
| |
| /* call near rel: 4-byte pc-relative offset from start of next instr |
| * we don't have that offset now so we fill it in later (in |
| * finalize_profile_call) |
| */ |
| APP(&ilist, INSTR_CREATE_call(dcontext, opnd_create_pc(NULL))); |
| |
| /* pop arguments: addl $0xc, %esp */ |
| APP(&ilist, |
| INSTR_CREATE_add(dcontext, opnd_create_reg(REG_ESP), OPND_CREATE_INT8(0xc))); |
| |
| /* restore eflags */ |
| APP(&ilist, INSTR_CREATE_RAW_popf(dcontext)); |
| |
| /* restore caller-saved registers */ |
| APP(&ilist, instr_create_restore_from_dcontext(dcontext, REG_ECX, SCRATCH_REG2_OFFS)); |
| |
| /* restore app stack */ |
| APP(&ilist, instr_create_restore_from_dcontext(dcontext, REG_ESP, XSP_OFFSET)); |
| |
| /* get start time = rdtsc */ |
| APP(&ilist, INSTR_CREATE_rdtsc(dcontext)); |
| |
| /* copy start time into dcontext */ |
| APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_EAX, start_time_offs)); |
| APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_EDX, start_time_offs+4)); |
| |
| /* finish restoring caller-saved registers */ |
| APP(&ilist, instr_create_restore_from_dcontext(dcontext, REG_EDX, SCRATCH_REG3_OFFS)); |
| APP(&ilist, instr_create_restore_from_dcontext(dcontext, REG_EAX, SCRATCH_REG0_OFFS)); |
| |
| /* now encode the instructions */ |
| pc = profile_call_buf; |
| for (inst = instrlist_first(&ilist); inst; inst = instr_get_next(inst)) { |
| if (instr_is_call_direct(inst)) { |
| /* push_immed was just before us, so fragment address |
| * starts 4 bytes before us: |
| */ |
| profile_call_fragment_offset = (int) (pc - 4 - profile_call_buf); |
| /* call opcode is 1 byte, offset is next: */ |
| profile_call_call_offset = (int) (pc + 1 - profile_call_buf); |
| } |
| /* we have no jumps with instr_t targets so we don't need to set note |
| * field in order to use instr_encode |
| */ |
| nxt_pc = instr_encode(dcontext, inst, (void*)pc); |
| ASSERT(nxt_pc != NULL); |
| profile_call_length += nxt_pc - pc; |
| pc = nxt_pc; |
| ASSERT(profile_call_length < 128); |
| } |
| |
| /* free the instrlist_t elements */ |
| instrlist_clear(dcontext, &ilist); |
| } |
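| |
| /* Note: aside from the two offsets recorded above (the fragment-address |
|  * immediate and the call displacement) and the buffer_dcontext displacements |
|  * fixed up in finalize_profile_call, the encoded buffer carries no position- |
|  * or thread-dependent state, which is what allows insert_profile_call to |
|  * simply memcpy it into any trace. |
|  */ |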
| |
| #endif /* PROFILE_RDTSC */ |
| |
| #ifdef WINDOWS |
| # ifdef CLIENT_INTERFACE |
| |
| /* Leaving in place old notes on LastError preservation: */ |
| /* inlined versions of save/restore last error by reading of TIB */ |
| /* If our inlined version fails on a later version of Windows, |
|    we should verify that [GS]etLastError matches the disassembly below. |
| */ |
| /* Win2000: kernel32!SetLastError: */ |
| /* 77E87671: 55 push ebp */ |
| /* 77E87672: 8B EC mov ebp,esp */ |
| /* 77E87674: 64 A1 18 00 00 00 mov eax,fs:[00000018] */ |
| /* 77E8767A: 8B 4D 08 mov ecx,dword ptr [ebp+8] */ |
| /* 77E8767D: 89 48 34 mov dword ptr [eax+34h],ecx */ |
| /* 77E87680: 5D pop ebp */ |
| /* 77E87681: C2 04 00 ret 4 */ |
| |
| /* Win2003: ntdll!RtlSetLastWin32Error: optimized to */ |
| /* 77F45BB4: 64 A1 18 00 00 00 mov eax,fs:[00000018] */ |
| /* 77F45BBA: 8B 4C 24 04 mov ecx,dword ptr [esp+4] */ |
| /* 77F45BBE: 89 48 34 mov dword ptr [eax+34h],ecx */ |
| /* 77F45BC1: C2 04 00 ret 4 */ |
| |
| /* See InsideWin2k, p. 329: SelfAddr at fs:[18h] simply holds the linear address of |
|    the TIB, while we're interested only in LastError, which is at fs:[34h] */ |
| /* Therefore all we need is a single instruction! */ |
| /* 64 a1 34 00 00 00 mov dword ptr fs:[34h],errno_register */ |
| /* Overall savings: 7 instructions, 5 data words */ |
| |
| /*kernel32!GetLastError:*/ |
| /* 77E87684: 64 A1 18 00 00 00 mov eax,fs:[00000018] */ |
| /* 77E8768A: 8B 40 34 mov eax,dword ptr [eax+34h] */ |
| /* 77E8768D: C3 ret */ |
| |
| /* All we need is a single instruction: */ |
| /* 77F45BBE: 89 48 34 mov reg_result, dword ptr fs:[34h] */ |
| |
| /* i#249: isolate app's PEB+TEB by keeping our own copy and swapping on cxt switch |
| * For clean calls we share this in clean_call_{save,restore} (i#171, i#1349). |
| */ |
| void |
| preinsert_swap_peb(dcontext_t *dcontext, instrlist_t *ilist, instr_t *next, |
| bool absolute, reg_id_t reg_dr, reg_id_t reg_scratch, bool to_priv) |
| { |
|     /* We assume the PEB is globally constant, so we don't need per-thread pointers |
|      * and can use absolute pointers known at init time |
|      */ |
| PEB *tgt_peb = to_priv ? get_private_peb() : get_own_peb(); |
| reg_id_t scratch32 = IF_X64_ELSE(reg_64_to_32(reg_scratch), reg_scratch); |
| ASSERT(INTERNAL_OPTION(private_peb) && should_swap_peb_pointer()); |
| ASSERT(reg_dr != REG_NULL && reg_scratch != REG_NULL); |
| /* can't store 64-bit immed, so we use scratch reg, for 32-bit too since |
| * long 32-bit-immed-store instr to fs:offs is slow to decode |
| */ |
| PRE(ilist, next, INSTR_CREATE_mov_imm |
| (dcontext, opnd_create_reg(reg_scratch), OPND_CREATE_INTPTR((ptr_int_t)tgt_peb))); |
| PRE(ilist, next, XINST_CREATE_store |
| (dcontext, opnd_create_far_base_disp |
| (SEG_TLS, REG_NULL, REG_NULL, 0, PEB_TIB_OFFSET, OPSZ_PTR), |
| opnd_create_reg(reg_scratch))); |
| |
| /* Preserve app's TEB->LastErrorValue. We used to do this separately b/c |
| * DR at one point long ago made some win32 API calls: now we only have to |
| * do this when loading private libraries. We assume no private library |
| * code needs to preserve LastErrorCode across app execution. |
| */ |
| if (to_priv) { |
| /* yes errno is 32 bits even on x64 */ |
| PRE(ilist, next, XINST_CREATE_load |
| (dcontext, opnd_create_reg(scratch32), opnd_create_far_base_disp |
| (SEG_TLS, REG_NULL, REG_NULL, 0, ERRNO_TIB_OFFSET, OPSZ_4))); |
| PRE(ilist, next, SAVE_TO_DC_VIA_REG |
| (absolute, dcontext, reg_dr, scratch32, APP_ERRNO_OFFSET)); |
| } else { |
| PRE(ilist, next, RESTORE_FROM_DC_VIA_REG |
| (absolute, dcontext, reg_dr, scratch32, APP_ERRNO_OFFSET)); |
| PRE(ilist, next, XINST_CREATE_store |
| (dcontext, opnd_create_far_base_disp |
| (SEG_TLS, REG_NULL, REG_NULL, 0, ERRNO_TIB_OFFSET, OPSZ_4), |
| opnd_create_reg(scratch32))); |
| } |
| |
| #ifdef X64 |
| /* We have to swap TEB->StackLimit (i#1102). For now I'm only doing this |
| * on X64, though it seems possible for 32-bit stacks to be up high too? |
| * We have never seen that. |
| */ |
| if (to_priv) { |
| PRE(ilist, next, XINST_CREATE_load |
| (dcontext, opnd_create_reg(reg_scratch), opnd_create_far_base_disp |
| (SEG_TLS, REG_NULL, REG_NULL, 0, BASE_STACK_TIB_OFFSET, OPSZ_PTR))); |
| PRE(ilist, next, SAVE_TO_DC_VIA_REG |
| (absolute, dcontext, reg_dr, reg_scratch, APP_STACK_LIMIT_OFFSET)); |
| PRE(ilist, next, RESTORE_FROM_DC_VIA_REG |
| (absolute, dcontext, reg_dr, reg_scratch, DSTACK_OFFSET)); |
| PRE(ilist, next, INSTR_CREATE_lea |
| (dcontext, opnd_create_reg(reg_scratch), |
| opnd_create_base_disp(reg_scratch, REG_NULL, 0, |
| -(int)DYNAMORIO_STACK_SIZE, OPSZ_lea))); |
| PRE(ilist, next, XINST_CREATE_store |
| (dcontext, opnd_create_far_base_disp |
| (SEG_TLS, REG_NULL, REG_NULL, 0, BASE_STACK_TIB_OFFSET, OPSZ_PTR), |
| opnd_create_reg(reg_scratch))); |
| } else { |
| PRE(ilist, next, RESTORE_FROM_DC_VIA_REG |
| (absolute, dcontext, reg_dr, reg_scratch, APP_STACK_LIMIT_OFFSET)); |
| PRE(ilist, next, XINST_CREATE_store |
| (dcontext, opnd_create_far_base_disp |
| (SEG_TLS, REG_NULL, REG_NULL, 0, BASE_STACK_TIB_OFFSET, OPSZ_PTR), |
| opnd_create_reg(reg_scratch))); |
| } |
| #endif |
| |
|     /* We also swap TEB->NlsCache. Unlike TEB->ProcessEnvironmentBlock, which is |
|      * constant, and TEB->LastErrorCode, which is not persistent, we have to maintain |
|      * both values and swap between them, which is expensive. |
|      */ |
| */ |
| PRE(ilist, next, XINST_CREATE_load |
| (dcontext, opnd_create_reg(reg_scratch), opnd_create_far_base_disp |
| (SEG_TLS, REG_NULL, REG_NULL, 0, NLS_CACHE_TIB_OFFSET, OPSZ_PTR))); |
| PRE(ilist, next, SAVE_TO_DC_VIA_REG |
| (absolute, dcontext, reg_dr, reg_scratch, |
| to_priv ? APP_NLS_CACHE_OFFSET : PRIV_NLS_CACHE_OFFSET)); |
| PRE(ilist, next, RESTORE_FROM_DC_VIA_REG |
| (absolute, dcontext, reg_dr, reg_scratch, |
| to_priv ? PRIV_NLS_CACHE_OFFSET : APP_NLS_CACHE_OFFSET)); |
| PRE(ilist, next, XINST_CREATE_store |
| (dcontext, opnd_create_far_base_disp |
| (SEG_TLS, REG_NULL, REG_NULL, 0, NLS_CACHE_TIB_OFFSET, OPSZ_PTR), |
| opnd_create_reg(reg_scratch))); |
|     /* We also swap TEB->FlsData. Unlike TEB->ProcessEnvironmentBlock, which is |
|      * constant, and TEB->LastErrorCode, which is not persistent, we have to maintain |
|      * both values and swap between them, which is expensive. |
|      */ |
| */ |
| PRE(ilist, next, XINST_CREATE_load |
| (dcontext, opnd_create_reg(reg_scratch), opnd_create_far_base_disp |
| (SEG_TLS, REG_NULL, REG_NULL, 0, FLS_DATA_TIB_OFFSET, OPSZ_PTR))); |
| PRE(ilist, next, SAVE_TO_DC_VIA_REG |
| (absolute, dcontext, reg_dr, reg_scratch, |
| to_priv ? APP_FLS_OFFSET : PRIV_FLS_OFFSET)); |
| PRE(ilist, next, RESTORE_FROM_DC_VIA_REG |
| (absolute, dcontext, reg_dr, reg_scratch, |
| to_priv ? PRIV_FLS_OFFSET : APP_FLS_OFFSET)); |
| PRE(ilist, next, XINST_CREATE_store |
| (dcontext, opnd_create_far_base_disp |
| (SEG_TLS, REG_NULL, REG_NULL, 0, FLS_DATA_TIB_OFFSET, OPSZ_PTR), |
| opnd_create_reg(reg_scratch))); |
| /* We swap TEB->ReservedForNtRpc as well. Hopefully there won't be many |
| * more we'll have to swap. |
| */ |
| PRE(ilist, next, XINST_CREATE_load |
| (dcontext, opnd_create_reg(reg_scratch), opnd_create_far_base_disp |
| (SEG_TLS, REG_NULL, REG_NULL, 0, NT_RPC_TIB_OFFSET, OPSZ_PTR))); |
| PRE(ilist, next, SAVE_TO_DC_VIA_REG |
| (absolute, dcontext, reg_dr, reg_scratch, |
| to_priv ? APP_RPC_OFFSET : PRIV_RPC_OFFSET)); |
| PRE(ilist, next, RESTORE_FROM_DC_VIA_REG |
| (absolute, dcontext, reg_dr, reg_scratch, |
| to_priv ? PRIV_RPC_OFFSET : APP_RPC_OFFSET)); |
| PRE(ilist, next, XINST_CREATE_store |
| (dcontext, opnd_create_far_base_disp |
| (SEG_TLS, REG_NULL, REG_NULL, 0, NT_RPC_TIB_OFFSET, OPSZ_PTR), |
| opnd_create_reg(reg_scratch))); |
| } |
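| |
| /* Sketch of the emitted swap for one TEB field (FlsData, to_priv case), |
|  * matching the load/save/restore/store sequence appended above; exact |
|  * encodings depend on absolute, reg_dr, and reg_scratch: |
|  *   mov  fs:FLS_DATA_TIB_OFFSET -> reg_scratch |
|  *   mov  reg_scratch -> dcontext slot at APP_FLS_OFFSET |
|  *   mov  dcontext slot at PRIV_FLS_OFFSET -> reg_scratch |
|  *   mov  reg_scratch -> fs:FLS_DATA_TIB_OFFSET |
|  * NlsCache and ReservedForNtRpc follow the same pattern with their own |
|  * offsets, and to_priv=false swaps the APP_/PRIV_ slots. |
|  */ |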
| # endif /* CLIENT_INTERFACE */ |
| |
| #endif /* WINDOWS */ |
| |
| /***************************************************************************/ |
| /* THREAD-PRIVATE/SHARED ROUTINE GENERATION */ |
| /***************************************************************************/ |
| |
| /* Export this in instr.h if it becomes useful elsewhere */ |
| #ifdef X86 |
| # ifdef X64 |
| # ifdef WINDOWS |
| # define OPND_ARG1 opnd_create_reg(REG_RCX) |
| # else |
| # define OPND_ARG1 opnd_create_reg(REG_RDI) |
| # endif /* Win/Unix */ |
| # else |
| # define OPND_ARG1 OPND_CREATE_MEM32(REG_ESP, 4) |
| # endif /* 64/32-bit */ |
| #elif defined(ARM) |
| # define OPND_ARG1 opnd_create_reg(DR_REG_R0) |
| #endif /* X86/ARM */ |
| |
| /* register for holding dcontext on fcache enter/return */ |
| #define REG_DCTXT SCRATCH_REG5 |
| |
| /* append instructions to setup fcache target |
| * if (!absolute) |
| * # put target somewhere we can be absolute about |
| * RESTORE_FROM_UPCONTEXT next_tag_OFFSET,%xax |
| * if (shared) |
| * mov %xax,fs:xax_OFFSET |
| * endif |
| * endif |
| */ |
| static void |
| append_setup_fcache_target(dcontext_t *dcontext, instrlist_t *ilist, |
| bool absolute, bool shared) |
| { |
| if (absolute) |
| return; |
| |
| /* put target into special slot that we can be absolute about */ |
| APP(ilist, RESTORE_FROM_DC(dcontext, SCRATCH_REG0, NEXT_TAG_OFFSET)); |
| if (shared) { |
| APP(ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG0, FCACHE_ENTER_TARGET_SLOT)); |
| } else { |
| #ifdef WINDOWS |
| /* absolute into main dcontext (not one in REG_DCTXT) */ |
| APP(ilist, instr_create_save_to_dcontext(dcontext, SCRATCH_REG0, |
| NONSWAPPED_SCRATCH_OFFSET)); |
| #else |
| /* no special scratch slot! */ |
| ASSERT_NOT_IMPLEMENTED(false); |
| #endif /* !WINDOWS */ |
| } |
| } |
| |
| /* append instructions to jump to target in code cache |
| * ifdef X64 and (target is x86 mode) |
| * # we can't indirect through a register since we couldn't restore |
| * # the high bits (PR 283152) |
| * mov gencode-jmp86-value, fs:xbx_OFFSET |
| * far jmp to next instr, stored w/ 32-bit cs selector in fs:xbx_OFFSET |
| * endif |
| * |
| * # jump indirect through dcontext->next_tag, set by dispatch() |
| * if (absolute) |
| * JUMP_VIA_DCONTEXT next_tag_OFFSET |
| * else |
| * if (shared) |
| * jmp *fs:xax_OFFSET |
| * else |
| * JUMP_VIA_DCONTEXT nonswapped_scratch_OFFSET |
| * endif |
| * endif |
| */ |
| static void |
| append_jmp_to_fcache_target(dcontext_t *dcontext, instrlist_t *ilist, |
| generated_code_t *code, |
| bool absolute, bool shared, patch_list_t *patch |
| _IF_X64(byte **jmp86_store_addr) |
| _IF_X64(byte **jmp86_target_addr)) |
| { |
| #ifdef X86_64 |
| if (GENCODE_IS_X86(code->gencode_mode)) { |
| instr_t *label = INSTR_CREATE_label(dcontext); |
| instr_t *store; |
| /* We must use an indirect jmp (far direct are illegal in x64) and |
| * we can't indirect through a register since we couldn't restore the |
| * high bits (PR 283152) so we write the 6-byte far address to TLS. |
| */ |
| /* AMD only supports 32-bit address for far jmp */ |
| store = XINST_CREATE_store(dcontext, |
| OPND_TLS_FIELD_SZ(TLS_SLOT_REG1, OPSZ_4), |
| OPND_CREATE_INT32(0/*placeholder*/)); |
| APP(ilist, store); |
| APP(ilist, XINST_CREATE_store(dcontext, |
| OPND_TLS_FIELD_SZ(TLS_SLOT_REG1+4, OPSZ_2), |
| OPND_CREATE_INT16((ushort)CS32_SELECTOR))); |
| APP(ilist, INSTR_CREATE_jmp_far_ind(dcontext, |
| OPND_TLS_FIELD_SZ(TLS_SLOT_REG1, OPSZ_6))); |
| APP(ilist, label); |
| /* We need a patch that involves two instrs, which is not supported, |
| * so we get both addresses involved into local vars and do the patch |
| * by hand after emitting. |
| */ |
| add_patch_marker(patch, store, PATCH_ASSEMBLE_ABSOLUTE, |
| -4 /* 4 bytes from end */, (ptr_uint_t*)jmp86_store_addr); |
| add_patch_marker(patch, label, PATCH_ASSEMBLE_ABSOLUTE, |
| 0 /* start of label */, (ptr_uint_t*)jmp86_target_addr); |
| } |
| #endif /* X86_64 */ |
| |
|     /* Jump indirect through next_tag. dispatch() sets this value to |
|      * where we want to go next in the fcache. |
|      */ |
| if (absolute) { |
| APP(ilist, instr_create_jump_via_dcontext(dcontext, NEXT_TAG_OFFSET)); |
| } else { |
| if (shared) { |
| /* next_tag placed into tls slot earlier in this routine */ |
| APP(ilist, |
| XINST_CREATE_jump_mem(dcontext, |
| OPND_TLS_FIELD(FCACHE_ENTER_TARGET_SLOT))); |
| |
| } else { |
| #ifdef WINDOWS |
| /* FIXME: we could just use tls, right? no real need for the "shared" |
| * parameter? |
| */ |
| /* need one absolute ref using main dcontext (not one in edi): |
| * it's the final jmp, using the special slot we set up earlier |
| */ |
| APP(ilist, instr_create_jump_via_dcontext(dcontext, |
| NONSWAPPED_SCRATCH_OFFSET)); |
| #else /* !WINDOWS */ |
| /* no special scratch slot! */ |
| ASSERT_NOT_IMPLEMENTED(false); |
| #endif /* !WINDOWS */ |
| } |
| } |
| } |
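| |
| /* For the x86-mode path above, the two stores lay out a 6-byte far pointer |
|  * in TLS_SLOT_REG1 (a sketch; the 32-bit target is filled in by hand after |
|  * encoding, via the two PATCH_ASSEMBLE_ABSOLUTE markers): |
|  *   bytes 0..3: 32-bit address of the label following the far jmp |
|  *   bytes 4..5: CS32_SELECTOR |
|  * which the OPSZ_6 jmp_far_ind then reads to perform the mode switch. |
|  */ |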
| |
| /* Our context switches to and from the fragment cache are arranged such |
| * that there is no persistent state kept on the dstack, allowing us to |
| * start with a clean slate on exiting the cache. This eliminates the |
| * need to protect our dstack from inadvertent or malicious writes. |
| * |
| * We do not bother to save any DynamoRIO state, even the eflags. We clear |
| * them in fcache_return, assuming that a cleared state is always the |
| * proper value (df is never set across the cache, etc.) |
| * |
| * The code is split into several helper functions. |
| * |
| * # Used by dispatch to begin execution in fcache at dcontext->next_tag |
| * fcache_enter(dcontext_t *dcontext) |
| * |
| * if (!absolute) |
| * mov ARG1, SCRATCH_REG5 # dcontext param |
| * if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) |
| * RESTORE_FROM_UPCONTEXT PROT_OFFSET, %xsi |
| * endif |
| * endif |
| * |
| * # append_setup_fcache_target |
| * if (!absolute) |
| * # put target somewhere we can be absolute about |
| * RESTORE_FROM_UPCONTEXT next_tag_OFFSET, SCRATCH_REG0 |
| * if (shared) |
| * mov SCRATCH_REG0, fs:xax_OFFSET |
| * endif |
| * endif |
| * |
| * # append_call_exit_dr_hook |
| * if (EXIT_DR_HOOK != NULL && !dcontext->ignore_enterexit) |
| * if (!absolute) |
| * push %xdi |
| * push %xsi |
| * else |
| * # support for skipping the hook |
| * RESTORE_FROM_UPCONTEXT ignore_enterexit_OFFSET,%edi |
| * cmpl %edi,0 |
| * jnz post_hook |
| * endif |
| * call EXIT_DR_HOOK # for x64 windows, reserve 32 bytes stack space for call |
| * if (!absolute) |
| * pop %xsi |
| * pop %xdi |
| * endif |
| * endif |
| * |
| * post_hook: |
| * |
| * # restore the original register state |
| * |
| * # append_restore_xflags |
| * RESTORE_FROM_UPCONTEXT xflags_OFFSET,%xax |
| * push %xax |
| * popf # restore eflags temporarily using dstack |
| * |
| * # append_restore_simd_reg |
| * if preserve_xmm_caller_saved |
| * RESTORE_FROM_UPCONTEXT xmm_OFFSET+0*16,%xmm0 |
| * RESTORE_FROM_UPCONTEXT xmm_OFFSET+1*16,%xmm1 |
| * RESTORE_FROM_UPCONTEXT xmm_OFFSET+2*16,%xmm2 |
| * RESTORE_FROM_UPCONTEXT xmm_OFFSET+3*16,%xmm3 |
| * RESTORE_FROM_UPCONTEXT xmm_OFFSET+4*16,%xmm4 |
| * RESTORE_FROM_UPCONTEXT xmm_OFFSET+5*16,%xmm5 |
| * RESTORE_FROM_UPCONTEXT xmm_OFFSET+6*16,%xmm6 # 32-bit Linux |
| * RESTORE_FROM_UPCONTEXT xmm_OFFSET+7*16,%xmm7 # 32-bit Linux |
| * endif |
| * |
| * # append_restore_gpr |
| * ifdef X64 |
| * RESTORE_FROM_UPCONTEXT r8_OFFSET,%r8 |
| * RESTORE_FROM_UPCONTEXT r9_OFFSET,%r9 |
| * RESTORE_FROM_UPCONTEXT r10_OFFSET,%r10 |
| * RESTORE_FROM_UPCONTEXT r11_OFFSET,%r11 |
| * RESTORE_FROM_UPCONTEXT r12_OFFSET,%r12 |
| * RESTORE_FROM_UPCONTEXT r13_OFFSET,%r13 |
| * RESTORE_FROM_UPCONTEXT r14_OFFSET,%r14 |
| * RESTORE_FROM_UPCONTEXT r15_OFFSET,%r15 |
| * endif |
| * RESTORE_FROM_UPCONTEXT xax_OFFSET,%xax |
| * RESTORE_FROM_UPCONTEXT xbx_OFFSET,%xbx |
| * RESTORE_FROM_UPCONTEXT xcx_OFFSET,%xcx |
| * RESTORE_FROM_UPCONTEXT xdx_OFFSET,%xdx |
| * if (absolute || !TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) |
| * RESTORE_FROM_UPCONTEXT xsi_OFFSET,%xsi |
| * endif |
| * if (absolute || TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) |
| * RESTORE_FROM_UPCONTEXT xdi_OFFSET,%xdi |
| * endif |
| * RESTORE_FROM_UPCONTEXT xbp_OFFSET,%xbp |
| * RESTORE_FROM_UPCONTEXT xsp_OFFSET,%xsp |
| * if (!absolute) |
| * if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) |
| * RESTORE_FROM_UPCONTEXT xsi_OFFSET,%xsi |
| * else |
| * RESTORE_FROM_UPCONTEXT xdi_OFFSET,%xdi |
| * endif |
| * endif |
| * |
| * # append_jmp_to_fcache_target |
| * ifdef X64 and (target is x86 mode) |
| * # we can't indirect through a register since we couldn't restore |
| * # the high bits (PR 283152) |
| * mov gencode-jmp86-value, fs:xbx_OFFSET |
| * far jmp to next instr, stored w/ 32-bit cs selector in fs:xbx_OFFSET |
| * endif |
| * |
| * # jump indirect through dcontext->next_tag, set by dispatch() |
| * if (absolute) |
| * JUMP_VIA_DCONTEXT next_tag_OFFSET |
| * else |
| * if (shared) |
| * jmp *fs:xax_OFFSET |
| * else |
| * JUMP_VIA_DCONTEXT nonswapped_scratch_OFFSET |
| * endif |
| * endif |
| * |
| * # now executing in fcache |
| */ |
| static byte * |
| emit_fcache_enter_common(dcontext_t *dcontext, generated_code_t *code, |
| byte *pc, bool absolute, bool shared) |
| { |
| int len; |
| instrlist_t ilist; |
| patch_list_t patch; |
| #ifdef X64 |
| byte *jmp86_store_addr = NULL; |
| byte *jmp86_target_addr = NULL; |
| #endif /* X64 */ |
| |
| init_patch_list(&patch, absolute ? PATCH_TYPE_ABSOLUTE : PATCH_TYPE_INDIRECT_XDI); |
| instrlist_init(&ilist); |
| |
| /* no support for absolute addresses on x64/ARM: we always use tls */ |
| IF_X64(ASSERT_NOT_IMPLEMENTED(!absolute && shared)); |
| IF_ARM(ASSERT_NOT_IMPLEMENTED(!absolute && shared)); |
| |
| if (!absolute) { |
| /* grab gen routine's parameter dcontext and put it into edi */ |
| APP(&ilist, |
| IF_X86_ELSE(XINST_CREATE_load, XINST_CREATE_move) |
| (dcontext, opnd_create_reg(SCRATCH_REG5), OPND_ARG1)); |
| if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) { |
| IF_X86_ELSE({ |
| APP(&ilist, RESTORE_FROM_DC(dcontext, SCRATCH_REG4, PROT_OFFS)); |
| }, { |
| /* FIXME i#1551: SELFPROT is not supported on ARM */ |
| ASSERT_NOT_REACHED(); |
| }); |
| } |
| } |
| |
| append_setup_fcache_target(dcontext, &ilist, absolute, shared); |
| append_call_exit_dr_hook(dcontext, &ilist, absolute, shared); |
| |
| #if defined(WINDOWS) && defined(CLIENT_INTERFACE) |
| /* i#249: isolate the PEB */ |
| if (INTERNAL_OPTION(private_peb) && should_swap_peb_pointer()) { |
| preinsert_swap_peb(dcontext, &ilist, NULL, absolute, SCRATCH_REG5, |
| SCRATCH_REG0/*scratch*/, false/*to app*/); |
| } |
| #endif |
| |
| /* restore the original register state */ |
| append_restore_xflags(dcontext, &ilist, absolute); |
| append_restore_simd_reg(dcontext, &ilist, absolute); |
| append_restore_gpr(dcontext, &ilist, absolute); |
| append_jmp_to_fcache_target(dcontext, &ilist, code, absolute, shared, &patch |
| _IF_X64(&jmp86_store_addr) |
| _IF_X64(&jmp86_target_addr)); |
| |
| /* now encode the instructions */ |
| len = encode_with_patch_list(dcontext, &patch, &ilist, pc); |
| ASSERT(len != 0); |
| |
| #ifdef X64 |
| if (GENCODE_IS_X86(code->gencode_mode)) { |
| /* Put the absolute address in place */ |
| ASSERT(jmp86_target_addr != NULL && jmp86_store_addr != NULL); |
| ASSERT(CHECK_TRUNCATE_TYPE_uint((ptr_uint_t)jmp86_target_addr)); |
| *((uint *)jmp86_store_addr) = (uint)(ptr_uint_t)jmp86_target_addr; |
| } |
| #endif |
| |
| /* free the instrlist_t elements */ |
| instrlist_clear(dcontext, &ilist); |
| |
| return pc + len; |
| } |
| |
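| /* Thread-private variant: absolute dcontext addressing, no TLS indirection. |
|  * The shared variant below (emit_fcache_enter_shared) instead passes |
|  * absolute=false, shared=true to go through TLS. |
|  */ |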
| byte * |
| emit_fcache_enter(dcontext_t *dcontext, generated_code_t *code, byte *pc) |
| { |
| return emit_fcache_enter_common(dcontext, code, pc, |
| true/*absolute*/, false/*!shared*/); |
| } |
| |
| /* Generate a shared prologue for grabbing the dcontext into XDI |
| |
| TODO: Should be used by fcache_return and shared IBL routines, |
| but for now some assumptions are not quite the same. |
| |
| Only assumption is that xcx cannot be touched (IBL expects looked up address) |
| if save_xdi we assume DCONTEXT_BASE_SPILL_SLOT can be clobbered |
| |
| OUTPUT: xdi contains dcontext |
| if save_xdi DCONTEXT_BASE_SPILL_SLOT will contain saved value |
| FIXME: xdx is the spill slot -- switch over to xdx as base reg? |
| Have to measure perf effect first (case 5239) |
| |
| 00: mov xdi, tls_slot_scratch2 64 89 3d 0c 0f 00 00 mov %edi -> %fs:0xf0c |
| 07: mov tls_slot_dcontext, xdi 64 8b 3d 14 0f 00 00 mov %fs:0xf14 -> %edi |
| if TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask) |
| ASSERT_NOT_TESTED |
| endif |
| */ |
| void |
| insert_shared_get_dcontext(dcontext_t *dcontext, instrlist_t *ilist, instr_t *where, |
| bool save_xdi) |
| { |
| /* needed to support grabbing the dcontext w/ shared cache */ |
| if (save_xdi) { |
| PRE(ilist, where, SAVE_TO_TLS(dcontext, SCRATCH_REG5/*xdi/r5*/, |
| DCONTEXT_BASE_SPILL_SLOT)); |
| } |
| PRE(ilist, where, RESTORE_FROM_TLS(dcontext, SCRATCH_REG5/*xdi/r5*/, |
| TLS_DCONTEXT_SLOT)); |
| if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) { |
| #ifdef X86 |
| bool absolute = false; |
| /* PR 224798: we could avoid extra indirection by storing |
| * unprotected_context_t in TLS_DCONTEXT_SLOT instead of dcontext_t |
| */ |
| ASSERT_NOT_TESTED(); |
|         /* we'd need a 3rd slot in order to nicely get the unprot ptr into esi. |
|          * We can do it w/ only 2 slots by clobbering the dcontext ptr |
|          * (we could add base reg info to RESTORE_FROM_DC/SAVE_TO_DC and go |
|          * straight through esi to begin w/ and subtract one instr (xchg)). |
|          */ |
| PRE(ilist, where, RESTORE_FROM_DC(dcontext, SCRATCH_REG5, PROT_OFFS)); |
| PRE(ilist, where, INSTR_CREATE_xchg(dcontext, opnd_create_reg(SCRATCH_REG4), |
| opnd_create_reg(SCRATCH_REG5))); |
| PRE(ilist, where, SAVE_TO_DC(dcontext, SCRATCH_REG5, SCRATCH_REG4_OFFS)); |
| PRE(ilist, where, RESTORE_FROM_TLS(dcontext, SCRATCH_REG5, TLS_DCONTEXT_SLOT)); |
| #elif defined(ARM) |
|         /* FIXME i#1551: NYI on ARM */ |
| ASSERT_NOT_REACHED(); |
| #endif |
| } |
| } |
| |
| |
| /* restore XDI through TLS */ |
| void |
| insert_shared_restore_dcontext_reg(dcontext_t *dcontext, instrlist_t *ilist, |
| instr_t *where) |
| { |
| PRE(ilist, where, RESTORE_FROM_TLS(dcontext, SCRATCH_REG5/*xdi/r5*/, |
| DCONTEXT_BASE_SPILL_SLOT)); |
| } |
| |
| |
| /* append instructions to prepare for fcache return: |
| * i.e., far jump to switch mode, load dcontext, etc. |
| * |
| * # on X86 |
| * ifdef X64 and (source is x86 mode) |
| * far direct jmp to next instr w/ 64-bit switch |
| * endif |
| * |
| * if (!absolute) |
| * mov %xdi,fs:xdx_OFFSET |
| * mov fs:dcontext,%xdi |
| * if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) |
| * RESTORE_FROM_DCONTEXT PROT_OFFSET,%xdi |
| * xchg %xsi,%xdi |
| * SAVE_TO_UPCONTEXT %xdi,xsi_OFFSET |
| * mov fs:dcontext,%xdi |
| * endif |
| * # get xax and xdi into their real slots, via xbx |
| * SAVE_TO_UPCONTEXT %xbx,xbx_OFFSET |
| * mov fs:xax_OFFSET,%xbx |
| * SAVE_TO_UPCONTEXT %xbx,xax_OFFSET |
| * mov fs:xdx_OFFSET,%xbx |
| * SAVE_TO_UPCONTEXT %xbx,xdi_OFFSET |
| * endif |
| */ |
| static void |
| append_prepare_fcache_return(dcontext_t *dcontext, generated_code_t *code, |
|                              instrlist_t *ilist, bool absolute, bool shared) |
| { |
| #ifdef X86_64 |
| if (GENCODE_IS_X86(code->gencode_mode)) { |
| instr_t *label = INSTR_CREATE_label(dcontext); |
| instr_t *ljmp = INSTR_CREATE_jmp_far |
| (dcontext, opnd_create_far_instr(CS64_SELECTOR, label)); |
| instr_set_x86_mode(ljmp, true/*x86*/); |
| APP(ilist, ljmp); |
| APP(ilist, label); |
| } |
| #endif /* X86_64 */ |
| |
| if (absolute) |
| return; |
| |
| /* only support non-absolute w/ shared cache */ |
| ASSERT_NOT_IMPLEMENTED(shared); |
| /* xax is in 1 scratch slot, so we have to use a 2nd scratch |
| * slot in order to get dcontext into xdi |
| */ |
| APP(ilist, SAVE_TO_TLS(dcontext, REG_DCTXT, DCONTEXT_BASE_SPILL_SLOT)); |
| APP(ilist, RESTORE_FROM_TLS(dcontext, REG_DCTXT, TLS_DCONTEXT_SLOT)); |
| if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) { |
| #ifdef X86 |
|         /* we'd need a 3rd slot in order to nicely get the unprot ptr into xsi. |
|          * We can do it w/ only 2 slots by clobbering the dcontext ptr |
|          * (we could add base reg info to RESTORE_FROM_DC/SAVE_TO_DC and go |
|          * straight through xsi to begin w/ and subtract one instr (xchg)). |
|          */ |
| ASSERT_NOT_TESTED(); |
| APP(ilist, RESTORE_FROM_DC(dcontext, SCRATCH_REG5, PROT_OFFS)); |
| APP(ilist, INSTR_CREATE_xchg(dcontext, opnd_create_reg(SCRATCH_REG4), |
| opnd_create_reg(SCRATCH_REG5))); |
| APP(ilist, SAVE_TO_DC(dcontext, SCRATCH_REG5, SCRATCH_REG4_OFFS)); |
| APP(ilist, RESTORE_FROM_TLS(dcontext, SCRATCH_REG5, TLS_DCONTEXT_SLOT)); |
| #elif defined(ARM) |
| /* FIXME i#1551: NYI on ARM */ |
| ASSERT_NOT_REACHED(); |
| #endif /* X86/ARM */ |
| } |
| } |
| |
| static void |
| append_call_dispatch(dcontext_t *dcontext, instrlist_t *ilist, bool absolute) |
| { |
| /* call central dispatch routine */ |
| /* for x64 linux we could optimize and avoid the "mov rdi, rdi" */ |
| dr_insert_call((void *)dcontext, ilist, NULL/*append*/, |
| (void *)dispatch, 1, |
| absolute ? |
| OPND_CREATE_INTPTR((ptr_int_t)dcontext) : opnd_create_reg(REG_DCTXT)); |
| |
| /* dispatch() shouldn't return! */ |
| insert_reachable_cti(dcontext, ilist, NULL, vmcode_get_start(), |
| (byte *)unexpected_return, true/*jmp*/, false/*!precise*/, |
| DR_REG_R11/*scratch*/, NULL); |
| } |
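| |
| /* dr_insert_call above materializes the single dispatch() argument per the |
|  * native calling convention (an immediate dcontext when absolute, else |
|  * REG_DCTXT); the trailing jmp to unexpected_return is reached only if |
|  * dispatch() ever returns, which it should not. |
|  */ |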
| |
| /* |
| * # fcache_return: context switch back to DynamoRIO. |
| * # Invoked via |
| * # a) from the fcache via a fragment exit stub, |
| * # b) from indirect_branch_lookup(). |
| * # Invokes dispatch() with a clean dstack. |
| * # Assumptions: |
| * # 1) app's value in xax/r0 already saved in dcontext. |
| * # 2) xax/r0 holds the linkstub ptr |
| * # |
| * |
| * fcache_return: |
| * # append_prepare_fcache_return |
| * ifdef X64 and (source is x86 mode) |
| * far direct jmp to next instr w/ 64-bit switch |
| * endif |
| * |
| * if (!absolute) |
| * mov %xdi,fs:xdx_OFFSET |
| * mov fs:dcontext,%xdi |
| * if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) |
| * RESTORE_FROM_DCONTEXT PROT_OFFSET,%xdi |
| * xchg %xsi,%xdi |
| * SAVE_TO_UPCONTEXT %xdi,xsi_OFFSET |
| * mov fs:dcontext,%xdi |
| * endif |
| * endif |
| * |
| * # append_save_gpr |
| * if (!absolute) |
| * # get xax and xdi into their real slots, via xbx |
| * SAVE_TO_UPCONTEXT %xbx,xbx_OFFSET |
| * mov fs:xax_OFFSET,%xbx |
| * SAVE_TO_UPCONTEXT %xbx,xax_OFFSET |
| * mov fs:xdx_OFFSET,%xbx |
| * SAVE_TO_UPCONTEXT %xbx,xdi_OFFSET |
| * endif |
| * |
| * # save the current register state to context->regs |
| * # xax already in context |
| * |
| * if (absolute) |
| * SAVE_TO_UPCONTEXT %xbx,xbx_OFFSET |
| * endif |
| * SAVE_TO_UPCONTEXT %xcx,xcx_OFFSET |
| * SAVE_TO_UPCONTEXT %xdx,xdx_OFFSET |
| * if (absolute || !TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) |
| * SAVE_TO_UPCONTEXT %xsi,xsi_OFFSET |
| * endif |
| * if (absolute) |
| * SAVE_TO_UPCONTEXT %xdi,xdi_OFFSET |
| * endif |
| * SAVE_TO_UPCONTEXT %xbp,xbp_OFFSET |
| * SAVE_TO_UPCONTEXT %xsp,xsp_OFFSET |
| * ifdef X64 |
| * SAVE_TO_UPCONTEXT %r8,r8_OFFSET |
| * SAVE_TO_UPCONTEXT %r9,r9_OFFSET |
| * SAVE_TO_UPCONTEXT %r10,r10_OFFSET |
| * SAVE_TO_UPCONTEXT %r11,r11_OFFSET |
| * SAVE_TO_UPCONTEXT %r12,r12_OFFSET |
| * SAVE_TO_UPCONTEXT %r13,r13_OFFSET |
| * SAVE_TO_UPCONTEXT %r14,r14_OFFSET |
| * SAVE_TO_UPCONTEXT %r15,r15_OFFSET |
| * endif |
| * |
| * # append_save_simd_reg |
| * if preserve_xmm_caller_saved |
| * SAVE_TO_UPCONTEXT %xmm0,xmm_OFFSET+0*16 |
| * SAVE_TO_UPCONTEXT %xmm1,xmm_OFFSET+1*16 |
| * SAVE_TO_UPCONTEXT %xmm2,xmm_OFFSET+2*16 |
| * SAVE_TO_UPCONTEXT %xmm3,xmm_OFFSET+3*16 |
| * SAVE_TO_UPCONTEXT %xmm4,xmm_OFFSET+4*16 |
| * SAVE_TO_UPCONTEXT %xmm5,xmm_OFFSET+5*16 |
| * SAVE_TO_UPCONTEXT %xmm6,xmm_OFFSET+6*16 # 32-bit Linux |
| * SAVE_TO_UPCONTEXT %xmm7,xmm_OFFSET+7*16 # 32-bit Linux |
| * endif |
| * |
| * # switch to clean dstack |
| * RESTORE_FROM_DCONTEXT dstack_OFFSET,%xsp |
| * |
| * # append_save_clear_xflags |
| * # now save eflags -- too hard to do without a stack! |
| * pushf # push eflags on stack |
| * pop %xbx # grab eflags value |
| * SAVE_TO_UPCONTEXT %xbx,xflags_OFFSET # save eflags value |
| * |
| * # clear eflags now to avoid app's eflags messing up our ENTER_DR_HOOK |
| * # FIXME: this won't work at CPL0 if we ever run there! |
| * push 0 |
| * popf |
| * |
| * # append_call_enter_dr_hook |
| * if (ENTER_DR_HOOK != NULL && !dcontext->ignore_enterexit) |
| * # don't bother to save any registers around call except for xax |
| * # and xcx, which holds next_tag |
| * push %xcx |
| * if (!absolute) |
| * push %xdi |
| * push %xsi |
| * endif |
| * push %xax |
| * if (absolute) |
| * # support for skipping the hook (note: 32-bits even on x64) |
| * RESTORE_FROM_UPCONTEXT ignore_enterexit_OFFSET,%edi |
| * cmp %edi,0 |
| * jnz post_hook |
| * endif |
| * # for x64 windows, reserve 32 bytes stack space for call prior to call |
| * call ENTER_DR_HOOK |
| * |
| * post_hook: |
| * pop %xax |
| * if (!absolute) |
| * pop %xsi |
| * pop %xdi |
| * endif |
| * pop %xcx |
| * endif |
| * |
| * # save last_exit, currently in eax, into dcontext->last_exit |
| * SAVE_TO_DCONTEXT %xax,last_exit_OFFSET |
| * |
| * .ifdef WINDOWS && CLIENT_INTERFACE |
| * swap_peb |
| * .endif |
| * |
| * .ifdef SIDELINE |
| * # clear cur-trace field so we don't think cur trace is still running |
| * movl $0, _sideline_trace |
| * .endif |
| * |
| * # call central dispatch routine w/ dcontext as an argument |
| * if (absolute) |
| * push <dcontext> |
| * else |
| * push %xdi # for x64, mov %xdi, ARG1 |
| * endif |
| * call dispatch # for x64 windows, reserve 32 bytes stack space for call |
| * # dispatch() shouldn't return! |
| * jmp unexpected_return |
| */ |
| |
| /* N.B.: this routine is used to generate both the regular fcache_return |
| * and a slightly different copy that is used for the miss/unlinked paths |
| * for indirect_branch_lookup for self-protection. |
| * ibl_end should be true only for that end of the lookup routine. |
| * |
| * If linkstub != NULL, used for coarse fragments, this routine assumes that: |
| * - app xax is still in %xax |
| * - next target pc is in DIRECT_STUB_SPILL_SLOT tls |
| * - linkstub is the linkstub_t to pass back to dispatch |
| * - if coarse_info: |
| * - app xcx is in MANGLE_XCX_SPILL_SLOT |
| * - source coarse info is in %xcx |
| * |
| * We assume this routine does not use TLS slot FLOAT_PC_STATE_SLOT (TLS_SLOT_REG1). |
| */ |
| bool |
| append_fcache_return_common(dcontext_t *dcontext, generated_code_t *code, |
| instrlist_t *ilist, bool ibl_end, |
| bool absolute, bool shared, linkstub_t *linkstub, |
| bool coarse_info) |
| { |
| bool instr_targets; |
| |
| /* no support for absolute addresses on x64: we always use tls */ |
| IF_X64(ASSERT_NOT_IMPLEMENTED(!absolute && shared)); |
| |
| /* currently linkstub is only used for coarse-grain exits */ |
| ASSERT(linkstub == NULL || !absolute); |
| |
|     append_prepare_fcache_return(dcontext, code, ilist, absolute, shared); |
| append_save_gpr(dcontext, ilist, ibl_end, absolute, code, linkstub, coarse_info); |
| append_save_simd_reg(dcontext, ilist, absolute); |
| |
| /* Switch to a clean dstack as part of our scheme to avoid state kept |
| * unprotected across cache executions. |
| * FIXME: this isn't perfect: we switch to the dstack BEFORE we call |
| * the entrance hook that will be used to coordinate other threads, |
| * so if our hook suspends all other threads to protect vs cross-thread |
| * attacks, the dstack is not perfectly protected. |
| */ |
| APP(ilist, RESTORE_FROM_DC(dcontext, REG_XSP, DSTACK_OFFSET)); |
| |
| append_save_clear_xflags(dcontext, ilist, absolute); |
| instr_targets = append_call_enter_dr_hook(dcontext, ilist, ibl_end, absolute); |
| |
| /* save last_exit, currently in scratch_reg0 into dcontext->last_exit */ |
| APP(ilist, SAVE_TO_DC(dcontext, SCRATCH_REG0, LAST_EXIT_OFFSET)); |
| |
| #if defined(WINDOWS) && defined(CLIENT_INTERFACE) |
| /* i#249: isolate the PEB */ |
| if (INTERNAL_OPTION(private_peb) && should_swap_peb_pointer()) { |
| preinsert_swap_peb(dcontext, ilist, NULL, absolute, SCRATCH_REG5, |
| SCRATCH_REG0/*scratch*/, true/*to priv*/); |
| } |
| #endif /* WINDOWS && CLIENT_INTERFACE */ |
| |
| #ifdef SIDELINE |
| if (dynamo_options.sideline) { |
| /* clear cur-trace field so we don't think cur trace is still running */ |
| /* PR 248210: unsupported feature on x64 */ |
| IF_X64(ASSERT_NOT_IMPLEMENTED(false)); /* PR 244737: fix abs address */ |
| APP(ilist, |
| XINST_CREATE_store(dcontext, |
| OPND_CREATE_MEM32(REG_NULL, (int)&sideline_trace), |
| OPND_CREATE_INT32(0))); |
| } |
| #endif |
| |
| append_call_dispatch(dcontext, ilist, absolute); |
| return instr_targets; |
| } |
| |
| byte * |
| emit_fcache_return(dcontext_t *dcontext, generated_code_t *code, byte *pc) |
| { |
| bool instr_targets; |
| instrlist_t ilist; |
| instrlist_init(&ilist); |
| instr_targets = append_fcache_return_common(dcontext, code, &ilist, |
| false/*!ibl_end*/, |
| true/*absolute*/, false/*!shared*/, |
| NULL, false/*not coarse*/); |
| /* now encode the instructions */ |
| pc = instrlist_encode(dcontext, &ilist, pc, instr_targets); |
| ASSERT(pc != NULL); |
| /* free the instrlist_t elements */ |
| instrlist_clear(dcontext, &ilist); |
| return pc; |
| } |
| |
| byte * |
| emit_fcache_enter_shared(dcontext_t *dcontext, generated_code_t *code, byte *pc) |
| { |
| return emit_fcache_enter_common(dcontext, code, pc, |
| false/*through xdi*/, true/*shared*/); |
| } |
| |
| byte * |
| emit_fcache_return_shared(dcontext_t *dcontext, generated_code_t *code, byte *pc) |
| { |
| bool instr_targets; |
| instrlist_t ilist; |
| instrlist_init(&ilist); |
| instr_targets = append_fcache_return_common(dcontext, code, &ilist, false/*!ibl_end*/, |
| false/*through xdi*/, true/*shared*/, |
| NULL, false/*not coarse*/); |
| /* now encode the instructions */ |
| pc = instrlist_encode(dcontext, &ilist, pc, instr_targets); |
| ASSERT(pc != NULL); |
| /* free the instrlist_t elements */ |
| instrlist_clear(dcontext, &ilist); |
| return pc; |
| } |
| |
| byte * |
| emit_fcache_return_coarse(dcontext_t *dcontext, generated_code_t *code, byte *pc) |
| { |
| bool instr_targets; |
| linkstub_t *linkstub = (linkstub_t *) get_coarse_exit_linkstub(); |
| instrlist_t ilist; |
| instrlist_init(&ilist); |
| instr_targets = append_fcache_return_common(dcontext, code, &ilist, false/*!ibl_end*/, |
| false/*through xdi*/, true/*shared*/, |
| linkstub, true/*coarse info in xcx*/); |
| /* now encode the instructions */ |
| pc = instrlist_encode(dcontext, &ilist, pc, instr_targets); |
| ASSERT(pc != NULL); |
| /* free the instrlist_t elements */ |
| instrlist_clear(dcontext, &ilist); |
| return pc; |
| } |
| |
| byte * |
| emit_trace_head_return_coarse(dcontext_t *dcontext, generated_code_t *code, byte *pc) |
| { |
| /* Could share tail end of coarse_fcache_return instead of duplicating */ |
| bool instr_targets; |
| linkstub_t *linkstub = (linkstub_t *) get_coarse_trace_head_exit_linkstub(); |
| instrlist_t ilist; |
| instrlist_init(&ilist); |
| instr_targets = append_fcache_return_common(dcontext, code, &ilist, false/*!ibl_end*/, |
| false/*through xdi*/, true/*shared*/, |
| linkstub, false/*no coarse info*/); |
| /* now encode the instructions */ |
| pc = instrlist_encode(dcontext, &ilist, pc, instr_targets); |
| ASSERT(pc != NULL); |
| /* free the instrlist_t elements */ |
| instrlist_clear(dcontext, &ilist); |
| return pc; |
| } |
| |
| /* Our coarse entrance stubs have several advantages, such as eliminating |
| * future fragments, but their accompanying lazy linking does need source |
| * information that is not available in each stub. We instead have an |
| * unlinked entrance stub target a per-unit prefix that records the source |
| * unit. We can then search within the unit to identify the actual source |
| * entrance stub, which is enough for lazy linking (but does not find the |
| * unique source tag: case 8565). This also gives us a single indirection |
| * point in the form of the prefix at which to patch the fcache_return target. |
| * We also place in the prefix indirection points for trace head cache exit and |
| * the 3 coarse ibl targets, to keep the cache read-only and (again) make it |
| * easier to patch when persisting/sharing. |
| */ |
| uint |
| coarse_exit_prefix_size(coarse_info_t *info) |
| { |
| #ifdef X64 |
| uint flags = COARSE_32_FLAG(info); |
| #endif |
| /* FIXME: would be nice to use size calculated in emit_coarse_exit_prefix(), |
| * but we need to know size before we emit and would have to do a throwaway |
| * emit, or else set up a template to be patched w/ specific info field. |
| * Also we'd have to unprot .data as we don't access this until post-init. |
| */ |
| /* We don't need to require addr16: in fact it might be better to force |
| * not using it, so if we persist on P4 but run on Core we don't lose |
| * performance. We have enough space. |
| */ |
| return SIZE_MOV_XBX_TO_TLS(flags, false) + SIZE_MOV_PTR_IMM_TO_XAX(flags) |
| + 5*JMP_LONG_LENGTH; |
| } |
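| |
| /* Using the per-instruction sizes annotated in emit_coarse_exit_prefix() |
|  * below (6/9 bytes for the xcx spill, 5/10 for the info-ptr mov, 5 per jmp), |
|  * this works out to roughly 6 + 5 + 5*5 = 36 bytes for a 32-bit unit and |
|  * 9 + 10 + 25 = 44 bytes for 64-bit. |
|  */ |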
| |
| byte * |
| emit_coarse_exit_prefix(dcontext_t *dcontext, byte *pc, coarse_info_t *info) |
| { |
| byte *ibl; |
| DEBUG_DECLARE(byte *start_pc = pc;) |
| instrlist_t ilist; |
| patch_list_t patch; |
| instr_t *fcache_ret_prefix; |
| #ifdef X64 |
| gencode_mode_t mode = FRAGMENT_GENCODE_MODE(COARSE_32_FLAG(info)); |
| #endif |
| |
| instrlist_init(&ilist); |
| init_patch_list(&patch, PATCH_TYPE_INDIRECT_FS); |
| |
| /* prefix looks like this, using xcx instead of xbx just to make |
| * the fcache_return code simpler (as it already uses xbx early), |
| * and using the info as we're doing per-cache and not per-unit: |
| * |
| * fcache_return_coarse_prefix: |
| * 6/9 mov %xcx, MANGLE_XCX_SPILL_SLOT |
| * 5/10 mov <info ptr>, %xcx |
| * 5 jmp fcache_return_coarse |
| * trace_head_return_coarse_prefix: |
| * 5 jmp trace_head_return_coarse |
| * (if -disable_traces, it jmps to fcache_return_coarse_prefix instead) |
| * coarse_ibl_ret_prefix: |
| * 5 jmp coarse_ibl_ret |
| * coarse_ibl_call_prefix: |
| * 5 jmp coarse_ibl_call |
| * coarse_ibl_jmp_prefix: |
| * 5 jmp coarse_ibl_jmp |
| * |
|      * We assume that the info ptr is at |
|      * trace_head_return_prefix - JMP_LONG_LENGTH - sizeof(info) |
|      * in patch_coarse_exit_prefix(). |
| * We assume that the ibl prefixes are nothing but jmps in |
| * coarse_indirect_stub_jmp_target() so we can recover the ibl type. |
| * |
| * FIXME case 9647: on P4 our jmp->jmp sequence will be |
| * elided, but on Core we may want to switch to a jmp*, though |
| * since we have no register for a base ptr we'd need a reloc |
| * entry for every single stub |
| */ |
| /* entrance stub has put target_tag into xax-slot so we use xcx-slot */ |
| ASSERT(DIRECT_STUB_SPILL_SLOT != MANGLE_XCX_SPILL_SLOT); |
| |
| fcache_ret_prefix = INSTR_CREATE_label(dcontext); |
| APP(&ilist, fcache_ret_prefix); |
| |
| #ifdef X64 |
| if (TEST(PERSCACHE_X86_32, info->flags)) { |
| /* XXX: this won't work b/c opnd size will be wrong */ |
| ASSERT_NOT_IMPLEMENTED(false && "must pass opnd size to SAVE_TO_TLS"); |
| APP(&ilist, SAVE_TO_TLS(dcontext, REG_ECX, MANGLE_XCX_SPILL_SLOT)); |
| /* We assume all our data structures are <4GB which is guaranteed for |
| * WOW64 processes. |
| */ |
| ASSERT(CHECK_TRUNCATE_TYPE_int((ptr_int_t)info)); |
| APP(&ilist, INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_ECX), |
| OPND_CREATE_INT32((int)(ptr_int_t)info))); |
| } else { /* default code */ |
| if (GENCODE_IS_X86_TO_X64(mode) && DYNAMO_OPTION(x86_to_x64_ibl_opt)) |
| APP(&ilist, SAVE_TO_REG(dcontext, SCRATCH_REG2, REG_R9)); |
| else |
| #endif |
| APP(&ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG2/*xcx/r2*/, |
| MANGLE_XCX_SPILL_SLOT)); |
| APP(&ilist, XINST_CREATE_load_int(dcontext, |
| opnd_create_reg(SCRATCH_REG2/*xcx/r2*/), |
| OPND_CREATE_INTPTR((ptr_int_t)info))); |
| #ifdef X64 |
| } |
| #endif |
| APP(&ilist, XINST_CREATE_jump(dcontext, |
| opnd_create_pc(get_direct_exit_target(dcontext, |
| FRAG_SHARED | FRAG_COARSE_GRAIN | |
| COARSE_32_FLAG(info))))); |
| |
| APP(&ilist, INSTR_CREATE_label(dcontext)); |
| add_patch_marker(&patch, instrlist_last(&ilist), PATCH_ASSEMBLE_ABSOLUTE, |
| 0 /* start of instr */, |
| (ptr_uint_t*)&info->trace_head_return_prefix); |
| if (DYNAMO_OPTION(disable_traces) || |
| /* i#670: the stub stored the abs addr at persist time. we need |
| * to adjust to the use-time mod base which we do in dispatch |
| * but we need to set the dcontext->coarse_exit so we go through |
| * the fcache return |
| */ |
| (info->frozen && info->mod_shift != 0)) { |
| /* trace_t heads need to store the info ptr for lazy linking */ |
| APP(&ilist, XINST_CREATE_jump(dcontext, opnd_create_instr(fcache_ret_prefix))); |
| } else { |
| APP(&ilist, XINST_CREATE_jump |
| (dcontext, opnd_create_pc(trace_head_return_coarse_routine(IF_X64(mode))))); |
| } |
| |
| /* coarse does not support IBL_FAR so we don't bother with get_ibl_entry_type() */ |
| ibl = get_ibl_routine_ex(dcontext, IBL_LINKED, |
| get_source_fragment_type(dcontext, |
| FRAG_SHARED | FRAG_COARSE_GRAIN), |
| IBL_RETURN _IF_X64(mode)); |
| APP(&ilist, XINST_CREATE_jump(dcontext, opnd_create_pc(ibl))); |
| add_patch_marker(&patch, instrlist_last(&ilist), PATCH_ASSEMBLE_ABSOLUTE, |
| 0 /* start of instr */, (ptr_uint_t*)&info->ibl_ret_prefix); |
| |
| ibl = get_ibl_routine_ex(dcontext, IBL_LINKED, |
| get_source_fragment_type(dcontext, |
| FRAG_SHARED | FRAG_COARSE_GRAIN), |
| IBL_INDCALL _IF_X64(mode)); |
| APP(&ilist, XINST_CREATE_jump(dcontext, opnd_create_pc(ibl))); |
| add_patch_marker(&patch, instrlist_last(&ilist), PATCH_ASSEMBLE_ABSOLUTE, |
| 0 /* start of instr */, (ptr_uint_t*)&info->ibl_call_prefix); |
| |
| ibl = get_ibl_routine_ex(dcontext, IBL_LINKED, |
| get_source_fragment_type(dcontext, |
| FRAG_SHARED | FRAG_COARSE_GRAIN), |
| IBL_INDJMP _IF_X64(mode)); |
| APP(&ilist, XINST_CREATE_jump(dcontext, opnd_create_pc(ibl))); |
| add_patch_marker(&patch, instrlist_last(&ilist), PATCH_ASSEMBLE_ABSOLUTE, |
| 0 /* start of instr */, (ptr_uint_t*)&info->ibl_jmp_prefix); |
| |
| /* now encode the instructions */ |
| pc += encode_with_patch_list(dcontext, &patch, &ilist, pc); |
| /* free the instrlist_t elements */ |
| instrlist_clear(dcontext, &ilist); |
| ASSERT((size_t)(pc - start_pc) == coarse_exit_prefix_size(info)); |
| |
| DOLOG(3, LOG_EMIT, { |
| byte *dpc = start_pc; |
| LOG(GLOBAL, LOG_EMIT, 3, "\nprefixes for coarse unit %s:\n", info->module); |
| do { |
| if (dpc == info->fcache_return_prefix) |
| LOG(GLOBAL, LOG_EMIT, 3, "fcache_return_coarse_prefix:\n"); |
| else if (dpc == info->trace_head_return_prefix) |
| LOG(GLOBAL, LOG_EMIT, 3, "trace_head_return_coarse_prefix:\n"); |
| else if (dpc == info->ibl_ret_prefix) |
| LOG(GLOBAL, LOG_EMIT, 3, "ibl_coarse_ret_prefix:\n"); |
| else if (dpc == info->ibl_call_prefix) |
| LOG(GLOBAL, LOG_EMIT, 3, "ibl_coarse_call_prefix:\n"); |
| else if (dpc == info->ibl_jmp_prefix) |
| LOG(GLOBAL, LOG_EMIT, 3, "ibl_coarse_jmp_prefix:\n"); |
| dpc = disassemble_with_bytes(dcontext, dpc, GLOBAL); |
| } while (dpc < pc); |
| LOG(GLOBAL, LOG_EMIT, 3, "\n"); |
| }); |
| |
| return pc; |
| } |
| |
| /* Update info pointer in exit prefixes */ |
| void |
| patch_coarse_exit_prefix(dcontext_t *dcontext, coarse_info_t *info) |
| { |
| ptr_uint_t *pc = (ptr_uint_t *) |
| (info->trace_head_return_prefix - JMP_LONG_LENGTH - sizeof(info)); |
| *pc = (ptr_uint_t) info; |
| } |
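| |
| /* Example of the address computed above (a sketch): with the 5-byte jmps |
|  * noted in the prefix layout comment and a pointer-sized info immediate, the |
|  * write lands JMP_LONG_LENGTH + sizeof(info) bytes before |
|  * trace_head_return_prefix, i.e. in the immediate operand of the |
|  * "mov <info ptr>, %xcx" of fcache_return_coarse_prefix. |
|  */ |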
| |
| |
| #ifdef HASHTABLE_STATISTICS |
| /* note that arch_thread_init is called before fragment_thread_init, |
| * so these need to be updated |
| */ |
| /* When used in a thread-shared routine, this routine clobbers XDI. The |
| * caller should spill & restore it or rematerialize it as needed. */ |
| /* NOTE - this routine does NOT save the eflags, which will be clobbered by the |
| * inc */ |
| void |
| append_increment_counter(dcontext_t *dcontext, instrlist_t *ilist, |
| ibl_code_t *ibl_code, patch_list_t *patch, |
| reg_id_t entry_register, /* register indirect (XCX) or NULL */ |
| /* adjusted to unprot_ht_statistics_t if no entry_register */ |
| uint counter_offset, |
| reg_id_t scratch_register) |
| { |
| #ifdef X86 |
| instr_t *counter; |
| #endif |
| bool absolute = !ibl_code->thread_shared_routine; |
| /* no support for absolute addresses on x64: we always use tls/reg */ |
| IF_X64(ASSERT_NOT_IMPLEMENTED(!absolute)); |
| |
| if (!INTERNAL_OPTION(hashtable_ibl_stats)) |
| return; |
| |
| LOG(THREAD, LOG_EMIT, 3, |
| "append_increment_counter: hashtable_stats_offset=0x%x counter_offset=0x%x\n", |
| ibl_code->hashtable_stats_offset, counter_offset); |
| |
| if (entry_register == REG_NULL) { |
|         /* adjust offset within an unprot_ht_statistics_t structure */ |
| counter_offset += ibl_code->hashtable_stats_offset; |
| } |
| |
| if (!absolute) { |
| opnd_t counter_opnd; |
| |
| /* get dcontext in register (xdi) */ |
| insert_shared_get_dcontext(dcontext, ilist, NULL, false/* dead register */); |
| /* XDI now has dcontext */ |
| APP(ilist, XINST_CREATE_load(dcontext, |
| opnd_create_reg(SCRATCH_REG5/*xdi/r5*/), |
| OPND_DC_FIELD(absolute, dcontext, OPSZ_PTR, |
| FRAGMENT_FIELD_OFFSET))); |
| |
| /* XDI now has per_thread_t structure */ |
| /* an extra step here: find the unprot_stats field in the fragment_table_t |
| * could avoid for protect_mask==0 if we always had a copy |
| * in the per_thread_t struct -- see fragment.h, not worth it |
| */ |
| if (entry_register != REG_NULL) { |
| APP(ilist, XINST_CREATE_load |
| (dcontext, opnd_create_reg(SCRATCH_REG5/*xdi/r5*/), |
| OPND_CREATE_MEMPTR(SCRATCH_REG5/*xdi/r5*/, |
| ibl_code->entry_stats_to_lookup_table_offset))); |
| /* XDI should now have (entry_stats - lookup_table) value, |
| * so we need [xdi+xcx] to get an entry reference |
| */ |
| counter_opnd = opnd_create_base_disp(SCRATCH_REG5/*xdi/r5*/, |
| entry_register, 1, |
| counter_offset, OPSZ_4); |
| } else { |
| APP(ilist, XINST_CREATE_load |
| (dcontext, opnd_create_reg(SCRATCH_REG5/*xdi/r5*/), |
| OPND_CREATE_MEMPTR(SCRATCH_REG5/*xdi/r5*/, |
| ibl_code->unprot_stats_offset))); |
| /* XDI now has unprot_stats structure */ |
| counter_opnd = OPND_CREATE_MEM32(SCRATCH_REG5/*xdi/r5*/, counter_offset); |
| } |
| |
| #ifdef X86 |
| counter = INSTR_CREATE_inc(dcontext, counter_opnd); |
| APP(ilist, counter); |
| #elif defined(ARM) |
|         /* FIXME i#1551: NYI on ARM */ |
| ASSERT_NOT_IMPLEMENTED(false); |
| #endif |
| } else { |
| #ifdef X86 |
|         /* TAKE_ADDRESS will in fact add the necessary base to the statistics |
|          * structure, hence no explicit indirection needed here |
|          */ |
| opnd_t c
|