/* ******************************************************************************
* Copyright (c) 2010-2014 Google, Inc. All rights reserved.
* Copyright (c) 2010 Massachusetts Institute of Technology All rights reserved.
* Copyright (c) 2000-2010 VMware, Inc. All rights reserved.
* ******************************************************************************/
/*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of VMware, Inc. nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
/* Copyright (c) 2003-2007 Determina Corp. */
/* Copyright (c) 2001-2003 Massachusetts Institute of Technology */
/* Copyright (c) 2000-2001 Hewlett-Packard Company */
/* file "mangle.c" */
#include "../globals.h"
#include "../link.h"
#include "../fragment.h"
#include "arch.h"
#include "instr.h"
#include "instr_create.h"
#include "instrlist.h"
#include "decode.h"
#include "decode_fast.h"
#include "disassemble.h"
#include "../hashtable.h"
#include "../fcache.h" /* for in_fcache */
#ifdef STEAL_REGISTER
#include "steal_reg.h"
#endif
#include "instrument.h" /* for dr_insert_call */
#include "../translate.h"
#ifdef RCT_IND_BRANCH
# include "../rct.h" /* rct_add_rip_rel_addr */
#endif
#ifdef UNIX
#include <sys/syscall.h>
#endif
#include <string.h> /* for memset */
#ifdef ANNOTATIONS
# include "../annotations.h"
#endif
/* make code more readable by shortening long lines
* we mark everything we add as a meta-instr to avoid hitting
* client asserts on setting translation fields
*/
#define POST instrlist_meta_postinsert
#define PRE instrlist_meta_preinsert
/***************************************************************************/
/* Convert a short-format CTI into an equivalent one using
* near-rel-format.
* Remember, the target is kept in the 0th src array position,
* and has already been converted from an 8-bit offset to an
* absolute PC, so we can just pretend instructions are longer
* than they really are.
*/
static instr_t *
convert_to_near_rel_common(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr)
{
int opcode = instr_get_opcode(instr);
DEBUG_DECLARE(const instr_info_t * info = instr_get_instr_info(instr);)
app_pc target = NULL;
if (opcode == OP_jmp_short) {
instr_set_opcode(instr, OP_jmp);
return instr;
}
if (OP_jo_short <= opcode && opcode <= OP_jnle_short) {
/* WARNING! following is OP_ enum order specific */
instr_set_opcode(instr, opcode - OP_jo_short + OP_jo);
return instr;
}
if (OP_loopne <= opcode && opcode <= OP_jecxz) {
uint mangled_sz;
uint offs;
/*
* from "info as" on GNU/linux system:
Note that the `jcxz', `jecxz', `loop', `loopz', `loope', `loopnz'
and `loopne' instructions only come in byte displacements, so that if
you use these instructions (`gcc' does not use them) you may get an
error message (and incorrect code). The AT&T 80386 assembler tries to
get around this problem by expanding `jcxz foo' to
jcxz cx_zero
jmp cx_nonzero
cx_zero: jmp foo
cx_nonzero:
*
* We use that same expansion, but we want to treat the entire
* three-instruction sequence as a single conditional branch.
* Thus we use a special instruction that stores the entire
* instruction sequence as mangled bytes, yet w/ a valid target operand
* (xref PR 251646).
* patch_branch and instr_invert_cbr
* know how to find the target pc (final 4 of 9 bytes).
* When decoding anything we've written we know the only jcxz or
* loop* instructions are part of these rewritten packages, and
* we use remangle_short_rewrite to read back in the instr.
* (we have to do this everywhere we call decode(), except in original
* interp, plus in input_trace())
*
* An alternative is to change 'jcxz foo' to:
<save eflags>
cmpb %cx,$0
je foo_restore
<restore eflags>
...
foo_restore: <restore eflags>
foo:
* However the added complications of restoring the eflags on
* the taken-branch path made me choose the former solution.
*/
/* SUMMARY:
* expand 'shortjump foo' to:
shortjump taken
jmp-short nottaken
taken: jmp foo
nottaken:
*/
if (ilist != NULL) {
/* PR 266292: for meta instrs, insert separate instrs */
/* reverse order */
opnd_t tgt = instr_get_target(instr);
instr_t *nottaken = INSTR_CREATE_label(dcontext);
instr_t *taken = INSTR_CREATE_jmp(dcontext, tgt);
ASSERT(instr_is_meta(instr));
instrlist_meta_postinsert(ilist, instr, nottaken);
instrlist_meta_postinsert(ilist, instr, taken);
instrlist_meta_postinsert(ilist, instr, INSTR_CREATE_jmp_short
(dcontext, opnd_create_instr(nottaken)));
instr_set_target(instr, opnd_create_instr(taken));
return taken;
}
if (opnd_is_near_pc(instr_get_target(instr)))
target = opnd_get_pc(instr_get_target(instr));
else if (opnd_is_near_instr(instr_get_target(instr))) {
instr_t *tgt = opnd_get_instr(instr_get_target(instr));
/* assumption: target's translation or raw bits are set properly */
target = instr_get_translation(tgt);
if (target == NULL && instr_raw_bits_valid(tgt))
target = instr_get_raw_bits(tgt);
ASSERT(target != NULL);
} else
ASSERT_NOT_REACHED();
/* PR 251646: cti_short_rewrite: target is in src0, so operands are
* valid, but raw bits must also be valid, since they hide the multiple
* instrs. For x64, it is marked for re-relativization, but it's
* special since the target must be obtained from src0 and not
* from the raw bits (since that might not reach).
*/
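/* Raw-byte layout we emit below (9 bytes, 10 with an addr prefix), e.g. for jecxz:
 *   jecxz  +2        2 bytes: taken path hops over the short jmp to the near jmp
 *   jmp_short +5     2 bytes: not-taken path skips over the 5-byte near jmp
 *   jmp    rel32     5 bytes: near jmp to the original target
 */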
/* need 9 bytes + possible addr prefix */
mangled_sz = CTI_SHORT_REWRITE_LENGTH;
if (!reg_is_pointer_sized(opnd_get_reg(instr_get_src(instr, 1))))
mangled_sz++; /* need addr prefix */
instr_allocate_raw_bits(dcontext, instr, mangled_sz);
offs = 0;
if (mangled_sz > CTI_SHORT_REWRITE_LENGTH) {
instr_set_raw_byte(instr, offs, ADDR_PREFIX_OPCODE);
offs++;
}
/* first 2 bytes: jecxz 8-bit-offset */
instr_set_raw_byte(instr, offs, decode_first_opcode_byte(opcode));
offs++;
/* remember pc-relative offsets are from start of next instr */
instr_set_raw_byte(instr, offs, (byte)2);
offs++;
/* next 2 bytes: jmp-short 8-bit-offset */
instr_set_raw_byte(instr, offs, decode_first_opcode_byte(OP_jmp_short));
offs++;
instr_set_raw_byte(instr, offs, (byte)5);
offs++;
/* next 5 bytes: jmp 32-bit-offset */
instr_set_raw_byte(instr, offs, decode_first_opcode_byte(OP_jmp));
offs++;
/* for x64 we may not reach, but we go ahead and try */
instr_set_raw_word(instr, offs, (int)
(target - (instr->bytes + mangled_sz)));
offs += sizeof(int);
ASSERT(offs == mangled_sz);
LOG(THREAD, LOG_INTERP, 2, "convert_to_near_rel: jecxz/loop* opcode\n");
/* original target operand is still valid */
instr_set_operands_valid(instr, true);
return instr;
}
LOG(THREAD, LOG_INTERP, 1, "convert_to_near_rel: unknown opcode: %d %s\n",
opcode, info->name);
ASSERT_NOT_REACHED(); /* conversion not possible OR not a short-form cti */
return instr;
}
instr_t *
convert_to_near_rel_meta(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr)
{
return convert_to_near_rel_common(dcontext, ilist, instr);
}
void
convert_to_near_rel(dcontext_t *dcontext, instr_t *instr)
{
convert_to_near_rel_common(dcontext, NULL, instr);
}
/* For jecxz and loop*, we create 3 instructions in a single
* instr that we treat like a single conditional branch.
* On re-decoding our own output we need to recreate that instr.
* This routine assumes that the instructions encoded at pc
* are indeed a mangled cti short.
* Assumes that the first instr has already been decoded into instr,
* that pc points to the start of that instr.
* Converts instr into a single new instr whose raw bits are a private copy of
* the original bytes of the entire 3-instruction rewrite.
* Optionally modifies the target to "target" if "target" is non-null.
* Returns the pc of the instruction after the remangled sequence.
*/
byte *
remangle_short_rewrite(dcontext_t *dcontext,
instr_t *instr, byte *pc, app_pc target)
{
uint mangled_sz = CTI_SHORT_REWRITE_LENGTH;
ASSERT(instr_is_cti_short_rewrite(instr, pc));
if (*pc == ADDR_PREFIX_OPCODE)
mangled_sz++;
/* first set the target in the actual operand src0 */
if (target == NULL) {
/* acquire existing absolute target */
int rel_target = *((int *)(pc + mangled_sz - 4));
target = pc + mangled_sz + rel_target;
}
instr_set_target(instr, opnd_create_pc(target));
/* now set up the bundle of raw instructions
* we've already read the first 2-byte instruction, jecxz/loop*
* they all take up mangled_sz bytes
*/
instr_allocate_raw_bits(dcontext, instr, mangled_sz);
instr_set_raw_bytes(instr, pc, mangled_sz);
/* for x64 we may not reach, but we go ahead and try */
instr_set_raw_word(instr, mangled_sz - 4, (int)(target - (pc + mangled_sz)));
/* now make operands valid */
instr_set_operands_valid(instr, true);
return (pc+mangled_sz);
}
/***************************************************************************/
#if !defined(STANDALONE_DECODER)
int
insert_out_of_line_context_switch(dcontext_t *dcontext, instrlist_t *ilist,
instr_t *instr, bool save)
{
if (save) {
/* We adjust the stack so the return address will not be clobbered,
* so we can have call/return pair to take advantage of hardware
* call return stack for better performance.
* xref emit_clean_call_save @ x86/emit_utils.c
*/
PRE(ilist, instr,
INSTR_CREATE_lea
(dcontext,
opnd_create_reg(DR_REG_XSP),
opnd_create_base_disp(DR_REG_XSP, DR_REG_NULL, 0,
-(int)(get_clean_call_switch_stack_size() +
get_clean_call_temp_stack_size()),
OPSZ_lea)));
}
PRE(ilist, instr,
INSTR_CREATE_call
(dcontext, save ?
opnd_create_pc(get_clean_call_save(dcontext _IF_X64(GENCODE_X64))) :
opnd_create_pc(get_clean_call_restore(dcontext _IF_X64(GENCODE_X64)))));
return get_clean_call_switch_stack_size();
}
void
insert_clear_eflags(dcontext_t *dcontext, clean_call_info_t *cci,
instrlist_t *ilist, instr_t *instr)
{
/* clear eflags for callee's usage */
if (cci == NULL || !cci->skip_clear_eflags) {
if (dynamo_options.cleancall_ignore_eflags) {
/* we still clear DF since some compilers assume
* DF is cleared on entry to each function.
*/
PRE(ilist, instr, INSTR_CREATE_cld(dcontext));
} else {
/* on x64 a push immed is sign-extended to 64-bit */
PRE(ilist, instr,
INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INT32(0)));
PRE(ilist, instr, INSTR_CREATE_popf(dcontext));
}
}
}
/* Pushes not only the GPRs but also xmm/ymm, xip, and xflags, in
* priv_mcontext_t order.
* The current stack pointer alignment should be passed. Use 1 if
* unknown (NOT 0).
* Returns the amount of data pushed. Does NOT fix up the xsp value pushed
* to be the value prior to any pushes for x64 as no caller needs that
* currently (they all build a priv_mcontext_t and have to do further xsp
* fixups anyway).
* Includes xmm0-5 for PR 264138.
*/
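/* Sketch of the resulting stack layout when nothing is skipped (low to high
 * addresses, matching priv_mcontext_t):
 *   [xsp] gprs: xdi, xsi, xbp, xsp, xbx, xdx, xcx, xax (+ r8-r15 for x64)
 *         xflags (pushf)
 *         pc (push_pc)
 *         PRE_XMM_PADDING
 *         xmm/ymm slots (XMM_SLOTS_SIZE)
 */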
uint
insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,
instrlist_t *ilist, instr_t *instr,
uint alignment, instr_t *push_pc)
{
uint dstack_offs = 0;
int offs_beyond_xmm = 0;
if (cci == NULL)
cci = &default_clean_call_info;
if (cci->preserve_mcontext || cci->num_xmms_skip != NUM_XMM_REGS) {
int offs = XMM_SLOTS_SIZE + PRE_XMM_PADDING;
if (cci->preserve_mcontext && cci->skip_save_aflags) {
offs_beyond_xmm = 2*XSP_SZ; /* pc and flags */
offs += offs_beyond_xmm;
}
PRE(ilist, instr, INSTR_CREATE_lea
(dcontext, opnd_create_reg(REG_XSP),
OPND_CREATE_MEM_lea(REG_XSP, REG_NULL, 0, -offs)));
dstack_offs += offs;
}
if (preserve_xmm_caller_saved()) {
/* PR 264138: we must preserve xmm0-5 if on a 64-bit kernel */
int i;
/* PR 266305: see discussion in emit_fcache_enter_shared on
* which opcode is better. Note that the AMD optimization
* guide says to use movlps+movhps for unaligned stores, but
* for simplicity and smaller code I'm using movups anyway.
*/
/* XXX i#438: once have SandyBridge processor need to measure
* cost of vmovdqu and whether worth arranging 32-byte alignment
* for all callers. B/c we put ymm at end of priv_mcontext_t, we do
* currently have 32-byte alignment for clean calls.
*/
uint opcode = move_mm_reg_opcode(ALIGNED(alignment, 16), ALIGNED(alignment, 32));
ASSERT(proc_has_feature(FEATURE_SSE));
for (i=0; i<NUM_XMM_SAVED; i++) {
if (!cci->xmm_skip[i]) {
PRE(ilist, instr, instr_create_1dst_1src
(dcontext, opcode,
opnd_create_base_disp(REG_XSP, REG_NULL, 0,
PRE_XMM_PADDING + i*XMM_SAVED_REG_SIZE +
offs_beyond_xmm,
OPSZ_SAVED_XMM),
opnd_create_reg(REG_SAVED_XMM0 + (reg_id_t)i)));
}
}
ASSERT(i*XMM_SAVED_REG_SIZE == XMM_SAVED_SIZE);
ASSERT(XMM_SAVED_SIZE <= XMM_SLOTS_SIZE);
}
/* pc and aflags */
if (!cci->skip_save_aflags) {
ASSERT(offs_beyond_xmm == 0);
PRE(ilist, instr, push_pc);
dstack_offs += XSP_SZ;
PRE(ilist, instr, INSTR_CREATE_pushf(dcontext));
dstack_offs += XSP_SZ;
} else {
ASSERT(offs_beyond_xmm == 2*XSP_SZ || !cci->preserve_mcontext);
/* for cci->preserve_mcontext we added to the lea above */
instr_destroy(dcontext, push_pc);
}
#ifdef X64
/* keep priv_mcontext_t order */
if (!cci->reg_skip[REG_R15 - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_R15)));
if (!cci->reg_skip[REG_R14 - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_R14)));
if (!cci->reg_skip[REG_R13 - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_R13)));
if (!cci->reg_skip[REG_R12 - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_R12)));
if (!cci->reg_skip[REG_R11 - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_R11)));
if (!cci->reg_skip[REG_R10 - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_R10)));
if (!cci->reg_skip[REG_R9 - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_R9)));
if (!cci->reg_skip[REG_R8 - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_R8)));
if (!cci->reg_skip[REG_RAX - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_RAX)));
if (!cci->reg_skip[REG_RCX - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_RCX)));
if (!cci->reg_skip[REG_RDX - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_RDX)));
if (!cci->reg_skip[REG_RBX - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_RBX)));
/* we do NOT match pusha xsp value */
if (!cci->reg_skip[REG_RSP - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_RSP)));
if (!cci->reg_skip[REG_RBP - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_RBP)));
if (!cci->reg_skip[REG_RSI - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_RSI)));
if (!cci->reg_skip[REG_RDI - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_RDI)));
dstack_offs += (NUM_GP_REGS - cci->num_regs_skip) * XSP_SZ;
#else
PRE(ilist, instr, INSTR_CREATE_pusha(dcontext));
dstack_offs += 8 * XSP_SZ;
#endif
ASSERT(cci->skip_save_aflags ||
cci->num_xmms_skip != 0 ||
cci->num_regs_skip != 0 ||
dstack_offs == (uint)get_clean_call_switch_stack_size());
return dstack_offs;
}
/* User should pass the alignment from insert_push_all_registers: i.e., the
* alignment at the end of all the popping, not the alignment prior to
* the popping.
*/
void
insert_pop_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,
instrlist_t *ilist, instr_t *instr,
uint alignment)
{
int offs_beyond_xmm = 0;
if (cci == NULL)
cci = &default_clean_call_info;
#ifdef X64
/* in priv_mcontext_t order */
if (!cci->reg_skip[REG_RDI - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_RDI)));
if (!cci->reg_skip[REG_RSI - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_RSI)));
if (!cci->reg_skip[REG_RBP - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_RBP)));
/* skip xsp by popping into dead rbx */
if (!cci->reg_skip[REG_RSP - REG_XAX]) {
ASSERT(!cci->reg_skip[REG_RBX - REG_XAX]);
PRE(ilist, instr, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_RBX)));
}
if (!cci->reg_skip[REG_RBX - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_RBX)));
if (!cci->reg_skip[REG_RDX - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_RDX)));
if (!cci->reg_skip[REG_RCX - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_RCX)));
if (!cci->reg_skip[REG_RAX - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_RAX)));
if (!cci->reg_skip[REG_R8 - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_R8)));
if (!cci->reg_skip[REG_R9 - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_R9)));
if (!cci->reg_skip[REG_R10 - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_R10)));
if (!cci->reg_skip[REG_R11 - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_R11)));
if (!cci->reg_skip[REG_R12 - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_R12)));
if (!cci->reg_skip[REG_R13 - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_R13)));
if (!cci->reg_skip[REG_R14 - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_R14)));
if (!cci->reg_skip[REG_R15 - REG_XAX])
PRE(ilist, instr, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_R15)));
#else
PRE(ilist, instr, INSTR_CREATE_popa(dcontext));
#endif
if (!cci->skip_save_aflags) {
PRE(ilist, instr, INSTR_CREATE_popf(dcontext));
offs_beyond_xmm = XSP_SZ; /* pc */
} else if (cci->preserve_mcontext) {
offs_beyond_xmm = 2*XSP_SZ; /* aflags + pc */
}
if (preserve_xmm_caller_saved()) {
/* PR 264138: we must preserve xmm0-5 if on a 64-bit kernel */
int i;
/* See discussion in emit_fcache_enter_shared on which opcode
* is better. */
uint opcode = move_mm_reg_opcode(ALIGNED(alignment, 32), ALIGNED(alignment, 16));
ASSERT(proc_has_feature(FEATURE_SSE));
for (i=0; i<NUM_XMM_SAVED; i++) {
if (!cci->xmm_skip[i]) {
PRE(ilist, instr, instr_create_1dst_1src
(dcontext, opcode, opnd_create_reg(REG_SAVED_XMM0 + (reg_id_t)i),
opnd_create_base_disp(REG_XSP, REG_NULL, 0,
PRE_XMM_PADDING + i*XMM_SAVED_REG_SIZE +
offs_beyond_xmm,
OPSZ_SAVED_XMM)));
}
}
ASSERT(i*XMM_SAVED_REG_SIZE == XMM_SAVED_SIZE);
ASSERT(XMM_SAVED_SIZE <= XMM_SLOTS_SIZE);
}
PRE(ilist, instr, INSTR_CREATE_lea
(dcontext, opnd_create_reg(REG_XSP),
OPND_CREATE_MEM_lea(REG_XSP, REG_NULL, 0,
PRE_XMM_PADDING + XMM_SLOTS_SIZE +
offs_beyond_xmm)));
}
reg_id_t
shrink_reg_for_param(reg_id_t regular, opnd_t arg)
{
#ifdef X64
if (opnd_get_size(arg) == OPSZ_4) { /* we ignore var-sized */
/* PR 250976 #2: leave 64-bit only if an immed w/ top bit set (we
* assume user wants sign-extension; that is after all what happens
* on a push of a 32-bit immed) */
if (!opnd_is_immed_int(arg) ||
(opnd_get_immed_int(arg) & 0x80000000) == 0)
return reg_64_to_32(regular);
}
#endif
return regular;
}
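/* e.g., on x64, shrink_reg_for_param(REG_RDI, OPND_CREATE_INT32(5)) returns REG_EDI
 * (the 32-bit mov zero-extends), while an immed with its top bit set keeps REG_RDI
 * so the 64-bit mov sign-extends it.
 */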
/* Returns the change in the stack pointer.
* N.B.: due to stack alignment and minimum stack reservation, do
* not use parameters involving esp/rsp, as its value can change!
*
* This routine only supports passing arguments that are integers or
* pointers of a size equal or smaller than the register size: i.e., no
* floating-point, multimedia, or aggregate data types.
*
* For 64-bit mode, if a 32-bit immediate integer is specified as an
* argument and it has its top bit set, we assume it is intended to be
* sign-extended to 64-bits; otherwise we zero-extend it.
*
* For 64-bit mode, variable-sized argument operands may not work
* properly.
*
* Arguments that reference REG_XSP will work for clean calls, but are not guaranteed
* to work for non-clean, especially for 64-bit where we align, etc. Arguments that
* reference sub-register portions of REG_XSP are not supported.
*
* XXX PR 307874: w/ a post optimization pass, or perhaps more clever use of
* existing passes, we could do much better on calling convention and xsp conflicting
* args. We should also really consider inlining client callees (PR 218907), since
* clean calls for 64-bit are enormous (71 instrs/264 bytes for 2-arg x64; 26
* instrs/99 bytes for x86) and we could avoid all the xmm saves and replace pushf w/
* lahf.
*/
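/* Illustrative client path that ends up here (a sketch; my_callee is hypothetical):
 *   dr_insert_clean_call(drcontext, bb, where, (void *)my_callee, false, 2,
 *                        OPND_CREATE_INT32(42), opnd_create_reg(DR_REG_XDX));
 * Both args obey the restrictions above: pointer-sized regs or int immediates.
 */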
uint
insert_parameter_preparation(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr,
bool clean_call, uint num_args, opnd_t *args)
{
uint i;
int r;
uint preparm_padding = 0;
uint param_stack = 0, total_stack = 0;
bool push = true;
bool restore_xax = false;
bool restore_xsp = false;
/* we need two passes for PR 250976 optimization */
/* Push/mov in reverse order. We need a label so we can also add
* instrs prior to the regular param prep. So params are POST-mark, while
* pre-param-prep is POST-prev or PRE-mark.
*/
#ifdef X64
uint arg_pre_push = 0, total_pre_push = 0;
#endif
instr_t *prev = (instr == NULL) ? instrlist_last(ilist) : instr_get_prev(instr);
instr_t *mark = INSTR_CREATE_label(dcontext);
PRE(ilist, instr, mark);
/* For a clean call, xax is dead (clobbered by prepare_for_clean_call()).
* Rather than use as scratch and restore prior to each param that uses it,
* we restore once up front if any use it, and use regparms[0] as scratch,
* which is symmetric with non-clean-calls: regparms[0] is dead since we're
* doing args in reverse order. However, we then can't use regparms[0]
* directly if referenced in earlier params, but similarly for xax, so
* there's no clear better way. (prepare_for_clean_call also clobbers xsp,
* but we just disallow args that use it).
*/
ASSERT(num_args == 0 || args != NULL);
/* We can get away w/ one pass, except for PR 250976 we want calling conv
* regs to be able to refer to priv_mcontext_t as well as potentially being
* pushed: but we need to know the total # pushes ahead of time (since hard
* to mark for post-patching)
*/
for (i = 0; i < num_args; i++) {
IF_X64(bool is_pre_push = false;)
for (r = 0; r < opnd_num_regs_used(args[i]); r++) {
reg_id_t used = opnd_get_reg_used(args[i], r);
IF_X64(int parm;)
LOG(THREAD, LOG_INTERP, 4,
"ipp: considering arg %d reg %d == %s\n", i, r, reg_names[used]);
if (clean_call && !restore_xax && reg_overlap(used, REG_XAX))
restore_xax = true;
if (reg_overlap(used, REG_XSP)) {
IF_X64(CLIENT_ASSERT(clean_call,
"Non-clean-call argument: REG_XSP not supported"));
CLIENT_ASSERT(used == REG_XSP,
"Call argument: sub-reg-xsp not supported");
if (clean_call && /*x64*/parameters_stack_padded() && !restore_xsp)
restore_xsp = true;
}
#ifdef X64
/* PR 250976 #A: count the number of pre-pushes we need */
parm = reg_parameter_num(used);
/* We can read a register used in an earlier arg since we store that
* arg later (we do reverse order), except arg0, which we use as
* scratch (we don't always need it, but not worth another pre-pass
* through all args to find out), and xsp. Otherwise, if a plain reg,
* we point at mcontext (we restore xsp slot in mcontext if nec.).
* If a mem ref, we need to pre-push onto stack.
* N.B.: this conditional is duplicated in 2nd loop.
*/
if (!is_pre_push &&
((parm == 0 && num_args > 1) || parm > (int)i ||
reg_overlap(used, REG_XSP)) &&
(!clean_call || !opnd_is_reg(args[i]))) {
total_pre_push++;
is_pre_push = true; /* ignore further regs in same arg */
}
#endif
}
}
if (parameters_stack_padded()) {
/* For x64, supposed to reserve rsp space in function prologue; we
* do next best thing and reserve it prior to setting up the args.
*/
push = false; /* store args to xsp offsets instead of pushing them */
total_stack = REGPARM_MINSTACK;
if (num_args > NUM_REGPARM)
total_stack += XSP_SZ * (num_args - NUM_REGPARM);
param_stack = total_stack;
IF_X64(total_stack += XSP_SZ * total_pre_push);
/* We assume rsp is currently 16-byte aligned. End of arguments is supposed
* to be 16-byte aligned for x64 SysV (note that retaddr will then make
* rsp 8-byte-aligned, which is ok: callee has to rectify that).
* For clean calls, prepare_for_clean_call leaves rsp aligned for x64.
* XXX PR 218790: we require users of dr_insert_call to ensure
* alignment; should we put in support to dynamically align?
*/
preparm_padding =
ALIGN_FORWARD_UINT(total_stack, REGPARM_END_ALIGN) - total_stack;
total_stack += preparm_padding;
/* we have to wait to insert the xsp adjust */
} else {
ASSERT(NUM_REGPARM == 0);
ASSERT(push);
IF_X64(ASSERT(total_pre_push == 0));
total_stack = XSP_SZ * num_args;
}
LOG(THREAD, LOG_INTERP, 3,
"insert_parameter_preparation: %d args, %d in-reg, %d pre-push, %d/%d stack\n",
num_args, NUM_REGPARM, IF_X64_ELSE(total_pre_push, 0), param_stack, total_stack);
for (i = 0; i < num_args; i++) {
/* FIXME PR 302951: we need to handle state restoration if any
* of these args references app memory. We should pull the state from
* the priv_mcontext_t on the stack if in a clean call. FIXME: what if not?
*/
opnd_t arg = args[i];
CLIENT_ASSERT(opnd_get_size(arg) == OPSZ_PTR || opnd_is_immed_int(arg)
IF_X64(|| opnd_get_size(arg) == OPSZ_4),
"Clean call arg has unsupported size");
#ifdef X64
/* PR 250976 #A: support args that reference param regs */
for (r = 0; r < opnd_num_regs_used(arg); r++) {
reg_id_t used = opnd_get_reg_used(arg, r);
int parm = reg_parameter_num(used);
/* See comments in loop above */
if ((parm == 0 && num_args > 1) || parm > (int)i ||
reg_overlap(used, REG_XSP)) {
int disp = 0;
if (clean_call && opnd_is_reg(arg)) {
/* We can point at the priv_mcontext_t slot.
* priv_mcontext_t is at the base of dstack: compute offset
* from xsp to the field we want and replace arg.
*/
disp += opnd_get_reg_dcontext_offs(opnd_get_reg(arg));
/* skip rest of what prepare_for_clean_call adds */
disp += clean_call_beyond_mcontext();
/* skip what this routine added */
disp += total_stack;
} else {
/* Push a temp on the stack and point at it. We
* could try to optimize by juggling registers, but
* not worth it.
*/
/* xsp was adjusted up above; we simply store to xsp offsets */
disp = param_stack + XSP_SZ * arg_pre_push;
if (opnd_is_reg(arg) && opnd_get_size(arg) == OPSZ_PTR) {
POST(ilist, prev, INSTR_CREATE_mov_st
(dcontext, OPND_CREATE_MEMPTR(REG_XSP, disp), arg));
} else {
reg_id_t xsp_scratch = regparms[0];
/* don't want to just change size since will read extra bytes.
* can't do mem-to-mem so go through scratch reg */
if (reg_overlap(used, REG_XSP)) {
/* Get original xsp into scratch[0] and replace in arg */
if (opnd_uses_reg(arg, regparms[0])) {
xsp_scratch = REG_XAX;
ASSERT(!opnd_uses_reg(arg, REG_XAX)); /* can't use 3 */
/* FIXME: rather than putting xsp into mcontext
* slot, better to just do local get from dcontext
* like we do for 32-bit below? */
POST(ilist, prev, instr_create_restore_from_tls
(dcontext, REG_XAX, TLS_XAX_SLOT));
}
opnd_replace_reg(&arg, REG_XSP, xsp_scratch);
}
POST(ilist, prev,
INSTR_CREATE_mov_st(dcontext,
OPND_CREATE_MEMPTR(REG_XSP, disp),
opnd_create_reg(regparms[0])));
/* If sub-ptr-size, zero-extend is what we want so no movsxd */
POST(ilist, prev, INSTR_CREATE_mov_ld
(dcontext, opnd_create_reg
(shrink_reg_for_param(regparms[0], arg)), arg));
if (reg_overlap(used, REG_XSP)) {
int xsp_disp = opnd_get_reg_dcontext_offs(REG_XSP) +
clean_call_beyond_mcontext() + total_stack;
POST(ilist, prev, INSTR_CREATE_mov_ld
(dcontext, opnd_create_reg(xsp_scratch),
OPND_CREATE_MEMPTR(REG_XSP, xsp_disp)));
if (xsp_scratch == REG_XAX) {
POST(ilist, prev, instr_create_save_to_tls
(dcontext, REG_XAX, TLS_XAX_SLOT));
}
}
if (opnd_uses_reg(arg, regparms[0])) {
/* must restore since earlier arg might have clobbered */
int mc_disp = opnd_get_reg_dcontext_offs(regparms[0]) +
clean_call_beyond_mcontext() + total_stack;
POST(ilist, prev, INSTR_CREATE_mov_ld
(dcontext, opnd_create_reg(regparms[0]),
OPND_CREATE_MEMPTR(REG_XSP, mc_disp)));
}
}
arg_pre_push++; /* running counter */
}
arg = opnd_create_base_disp(REG_XSP, REG_NULL, 0,
disp, opnd_get_size(arg));
break; /* once we've handled arg ignore further reg refs */
}
}
#endif
if (i < NUM_REGPARM) {
reg_id_t regparm = shrink_reg_for_param(regparms[i], arg);
if (opnd_is_immed_int(arg) || opnd_is_instr(arg)) {
POST(ilist, mark,
INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(regparm), arg));
} else {
POST(ilist, mark,
INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(regparm), arg));
}
} else {
if (push) {
IF_X64(ASSERT_NOT_REACHED()); /* no 64-bit push_imm! */
if (opnd_is_immed_int(arg) || opnd_is_instr(arg))
POST(ilist, mark, INSTR_CREATE_push_imm(dcontext, arg));
else {
if (clean_call && opnd_uses_reg(arg, REG_XSP)) {
/* We do a purely local expansion:
* spill eax, mc->eax, esp->eax, arg->eax, push eax, restore eax
*/
reg_id_t scratch = REG_XAX;
if (opnd_uses_reg(arg, scratch)) {
scratch = REG_XCX;
ASSERT(!opnd_uses_reg(arg, scratch)); /* can't use 3 regs */
}
opnd_replace_reg(&arg, REG_XSP, scratch);
POST(ilist, mark, instr_create_restore_from_tls
(dcontext, scratch, TLS_XAX_SLOT));
POST(ilist, mark, INSTR_CREATE_push(dcontext, arg));
POST(ilist, mark, instr_create_restore_from_dc_via_reg
(dcontext, scratch, scratch, XSP_OFFSET));
insert_get_mcontext_base
(dcontext, ilist, instr_get_next(mark), scratch);
POST(ilist, mark, instr_create_save_to_tls
(dcontext, scratch, TLS_XAX_SLOT));
} else
POST(ilist, mark, INSTR_CREATE_push(dcontext, arg));
}
} else {
/* xsp was adjusted up above; we simply store to xsp offsets */
uint offs = REGPARM_MINSTACK + XSP_SZ * (i - NUM_REGPARM);
#ifdef X64
if (opnd_is_immed_int(arg) || opnd_is_instr(arg)) {
/* PR 250976 #3: there is no memory store of 64-bit-immediate,
* so go through scratch reg */
ASSERT(NUM_REGPARM > 0);
POST(ilist, mark,
INSTR_CREATE_mov_st(dcontext,
OPND_CREATE_MEMPTR(REG_XSP, offs),
opnd_create_reg(regparms[0])));
POST(ilist, mark,
INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(regparms[0]),
arg));
} else {
#endif
if (opnd_is_memory_reference(arg)) {
/* can't do mem-to-mem so go through scratch */
reg_id_t scratch;
if (NUM_REGPARM > 0)
scratch = regparms[0];
else {
/* This happens on Mac.
* FIXME i#1370: not safe if later arg uses xax:
* local spill? Review how regparms[0] is preserved.
*/
scratch = REG_XAX;
}
POST(ilist, mark,
INSTR_CREATE_mov_st(dcontext,
OPND_CREATE_MEMPTR(REG_XSP, offs),
opnd_create_reg(scratch)));
POST(ilist, mark,
INSTR_CREATE_mov_ld(dcontext, opnd_create_reg
(shrink_reg_for_param(scratch, arg)),
arg));
} else {
POST(ilist, mark,
INSTR_CREATE_mov_st(dcontext,
OPND_CREATE_MEMPTR(REG_XSP, offs), arg));
}
#ifdef X64
}
#endif
}
}
}
if (!push && total_stack > 0) {
POST(ilist, prev, /* before everything else: pre-push and args */
/* can we use sub? may as well preserve eflags */
INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XSP),
OPND_CREATE_MEM_lea(REG_XSP, REG_NULL, 0,
-(int)total_stack)));
}
if (restore_xsp) {
/* before restore_xax, since we're going to clobber xax */
int disp = opnd_get_reg_dcontext_offs(REG_XSP);
instr_t *where = instr_get_next(prev);
/* skip rest of what prepare_for_clean_call adds */
disp += clean_call_beyond_mcontext();
insert_get_mcontext_base(dcontext, ilist, where, REG_XAX);
PRE(ilist, where, instr_create_restore_from_dc_via_reg
(dcontext, REG_XAX, REG_XAX, XSP_OFFSET));
PRE(ilist, where,
INSTR_CREATE_mov_st(dcontext, OPND_CREATE_MEMPTR(REG_XSP, disp),
opnd_create_reg(REG_XAX)));
/* now we need restore_xax to be AFTER this */
prev = instr_get_prev(where);
}
if (restore_xax) {
int disp = opnd_get_reg_dcontext_offs(REG_XAX);
/* skip rest of what prepare_for_clean_call adds */
disp += clean_call_beyond_mcontext();
POST(ilist, prev, /* before everything else: pre-push, args, and stack adjust */
INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_XAX),
OPND_CREATE_MEMPTR(REG_XSP, disp)));
}
return total_stack;
}
/* If jmp_instr == NULL, uses jmp_tag, otherwise uses jmp_instr
*/
void
insert_clean_call_with_arg_jmp_if_ret_true(dcontext_t *dcontext,
instrlist_t *ilist, instr_t *instr, void *callee, int arg,
app_pc jmp_tag, instr_t *jmp_instr)
{
instr_t *false_popa, *jcc;
prepare_for_clean_call(dcontext, NULL, ilist, instr);
dr_insert_call(dcontext, ilist, instr, callee, 1, OPND_CREATE_INT32(arg));
/* if the return value (xax) is 0, then jmp to internal false path */
PRE(ilist,instr, /* can't cmp w/ 64-bit immed so use test (shorter anyway) */
INSTR_CREATE_test(dcontext, opnd_create_reg(REG_XAX), opnd_create_reg(REG_XAX)));
/* fill in jcc target once have false path */
jcc = INSTR_CREATE_jcc(dcontext, OP_jz, opnd_create_pc(NULL));
PRE(ilist, instr, jcc);
/* if it falls through, then it's true, so restore and jmp to true tag
* passed in by caller
*/
cleanup_after_clean_call(dcontext, NULL, ilist, instr);
if (jmp_instr == NULL) {
/* an exit cti, not a meta instr */
instrlist_preinsert
(ilist, instr, INSTR_CREATE_jmp(dcontext, opnd_create_pc(jmp_tag)));
} else {
PRE(ilist, instr,
INSTR_CREATE_jmp(dcontext, opnd_create_instr(jmp_instr)));
}
/* otherwise (if returned false), just do standard popf and continue */
/* get 1st instr of cleanup path */
false_popa = instr_get_prev(instr);
cleanup_after_clean_call(dcontext, NULL, ilist, instr);
false_popa = instr_get_next(false_popa);
instr_set_target(jcc, opnd_create_instr(false_popa));
}
/* If !precise, encode_pc is treated as +- a page (meant for clients
* writing an instrlist to gencode so not sure of exact placement but
* within a page).
* If encode_pc == vmcode_get_start(), checks reachability of whole
* vmcode region (meant for code going somewhere not precisely known
* in the code cache).
* Returns whether ended up using a direct cti. If inlined_tgt_instr != NULL,
* and an inlined target was used, returns a pointer to that instruction
* in *inlined_tgt_instr.
*/
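/* Emitted forms (sketch): if target is rel32-reachable from the whole encode range
 * we use a direct "jmp/call target"; otherwise either
 *   mov $target -> scratch; jmp/call scratch     (when scratch != DR_REG_NULL)
 * or an indirect jmp/call through a pointer-sized target value inlined into the ilist.
 */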
bool
insert_reachable_cti(dcontext_t *dcontext, instrlist_t *ilist, instr_t *where,
byte *encode_pc, byte *target, bool jmp, bool precise,
reg_id_t scratch, instr_t **inlined_tgt_instr)
{
byte *encode_start;
byte *encode_end;
if (precise) {
encode_start = target + JMP_LONG_LENGTH;
encode_end = encode_start;
} else if (encode_pc == vmcode_get_start()) {
/* consider whole vmcode region */
encode_start = encode_pc;
encode_end = vmcode_get_end();
} else {
encode_start = (byte *) PAGE_START(encode_pc - PAGE_SIZE);
encode_end = (byte *) ALIGN_FORWARD(encode_pc + PAGE_SIZE, PAGE_SIZE);
}
if (REL32_REACHABLE(encode_start, target) &&
REL32_REACHABLE(encode_end, target)) {
/* For precise, we could consider a short cti, but so far no
* users are precise so we'll leave that for i#56.
*/
if (jmp)
PRE(ilist, where, INSTR_CREATE_jmp(dcontext, opnd_create_pc(target)));
else
PRE(ilist, where, INSTR_CREATE_call(dcontext, opnd_create_pc(target)));
return true;
} else {
opnd_t ind_tgt;
instr_t *inlined_tgt = NULL;
if (scratch == DR_REG_NULL) {
/* indirect through an inlined target */
inlined_tgt = instr_build_bits(dcontext, OP_UNDECODED, sizeof(target));
/* XXX: could use mov imm->xax and have target skip rex+opcode
* for clean disassembly
*/
instr_set_raw_bytes(inlined_tgt, (byte *) &target, sizeof(target));
/* this will copy the bytes for us, so we don't have to worry about
* the lifetime of the target param
*/
instr_allocate_raw_bits(dcontext, inlined_tgt, sizeof(target));
ind_tgt = opnd_create_mem_instr(inlined_tgt, 0, OPSZ_PTR);
if (inlined_tgt_instr != NULL)
*inlined_tgt_instr = inlined_tgt;
} else {
PRE(ilist, where, INSTR_CREATE_mov_imm
(dcontext, opnd_create_reg(scratch), OPND_CREATE_INTPTR(target)));
ind_tgt = opnd_create_reg(scratch);
if (inlined_tgt_instr != NULL)
*inlined_tgt_instr = NULL;
}
if (jmp)
PRE(ilist, where, INSTR_CREATE_jmp_ind(dcontext, ind_tgt));
else
PRE(ilist, where, INSTR_CREATE_call_ind(dcontext, ind_tgt));
if (inlined_tgt != NULL)
PRE(ilist, where, inlined_tgt);
return false;
}
}
/*###########################################################################
*###########################################################################
*
* M A N G L I N G R O U T I N E S
*/
/* If src_inst != NULL, uses it (and assumes it will be encoded at
* encode_estimate to determine whether > 32 bits or not: so if unsure where
* it will be encoded, pass a high address) as the immediate; else
* uses val.
*/
void
insert_mov_immed_arch(dcontext_t *dcontext, instr_t *src_inst, byte *encode_estimate,
ptr_int_t val, opnd_t dst,
instrlist_t *ilist, instr_t *instr,
instr_t **first, instr_t **second)
{
instr_t *mov1, *mov2;
if (src_inst != NULL)
val = (ptr_int_t) encode_estimate;
#ifdef X64
if (X64_MODE_DC(dcontext) && !opnd_is_reg(dst)) {
if (val <= INT_MAX && val >= INT_MIN) {
/* mov is sign-extended, so we can use one move if it is all
* 0 or 1 in top 33 bits
*/
mov1 = INSTR_CREATE_mov_imm(dcontext, dst,
(src_inst == NULL) ?
OPND_CREATE_INT32((int)val) :
opnd_create_instr_ex(src_inst, OPSZ_4, 0));
PRE(ilist, instr, mov1);
mov2 = NULL;
} else {
/* do mov-64-bit-immed in two pieces. tiny corner-case risk of racy
* access to [dst] if this thread is suspended in between or another
* thread is trying to read [dst], but o/w we have to spill and
* restore a register.
*/
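/* e.g., storing 0x1122334455667788 to [mem] becomes:
 *   mov dword ptr [mem],   0x55667788
 *   mov dword ptr [mem+4], 0x11223344
 */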
CLIENT_ASSERT(opnd_is_memory_reference(dst), "invalid dst opnd");
/* mov low32 => [mem32] */
opnd_set_size(&dst, OPSZ_4);
mov1 = INSTR_CREATE_mov_st(dcontext, dst,
(src_inst == NULL) ?
OPND_CREATE_INT32((int)val) :
opnd_create_instr_ex(src_inst, OPSZ_4, 0));
PRE(ilist, instr, mov1);
/* mov high32 => [mem32+4] */
if (opnd_is_base_disp(dst)) {
int disp = opnd_get_disp(dst);
CLIENT_ASSERT(disp + 4 > disp, "disp overflow");
opnd_set_disp(&dst, disp+4);
} else {
byte *addr = opnd_get_addr(dst);
CLIENT_ASSERT(!POINTER_OVERFLOW_ON_ADD(addr, 4),
"addr overflow");
dst = OPND_CREATE_ABSMEM(addr+4, OPSZ_4);
}
mov2 = INSTR_CREATE_mov_st(dcontext, dst,
(src_inst == NULL) ?
OPND_CREATE_INT32((int)(val >> 32)) :
opnd_create_instr_ex(src_inst, OPSZ_4, 32));
PRE(ilist, instr, mov2);
}
} else {
#endif
mov1 = INSTR_CREATE_mov_imm(dcontext, dst,
(src_inst == NULL) ?
OPND_CREATE_INTPTR(val) :
opnd_create_instr_ex(src_inst, OPSZ_4, 0));
PRE(ilist, instr, mov1);
mov2 = NULL;
#ifdef X64
}
#endif
if (first != NULL)
*first = mov1;
if (second != NULL)
*second = mov2;
}
/* If src_inst != NULL, uses it (and assumes it will be encoded at
* encode_estimate to determine whether > 32 bits or not: so if unsure where
* it will be encoded, pass a high address) as the immediate; else
* uses val.
*/
void
insert_push_immed_arch(dcontext_t *dcontext, instr_t *src_inst, byte *encode_estimate,
ptr_int_t val, instrlist_t *ilist, instr_t *instr,
instr_t **first, instr_t **second)
{
instr_t *push, *mov;
if (src_inst != NULL)
val = (ptr_int_t) encode_estimate;
#ifdef X64
if (X64_MODE_DC(dcontext)) {
/* do push-64-bit-immed in two pieces. tiny corner-case risk of racy
* access to TOS if this thread is suspended in between or another
* thread is trying to read its stack, but o/w we have to spill and
* restore a register.
*/
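/* e.g., pushing 0x1122334455667788 becomes:
 *   push 0x55667788                   (sign-extended to 64 bits)
 *   mov dword ptr [xsp+4], 0x11223344
 */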
push = INSTR_CREATE_push_imm(dcontext,
(src_inst == NULL) ?
OPND_CREATE_INT32((int)val) :
opnd_create_instr_ex(src_inst, OPSZ_4, 0));
PRE(ilist, instr, push);
/* push is sign-extended, so we can skip top half if it is all 0 or 1
* in top 33 bits
*/
if (val <= INT_MAX && val >= INT_MIN) {
mov = NULL;
} else {
mov = INSTR_CREATE_mov_st(dcontext, OPND_CREATE_MEM32(REG_XSP, 4),
(src_inst == NULL) ?
OPND_CREATE_INT32((int)(val >> 32)) :
opnd_create_instr_ex(src_inst, OPSZ_4, 32));
PRE(ilist, instr, mov);
}
} else {
#endif
push = INSTR_CREATE_push_imm(dcontext,
(src_inst == NULL) ?
OPND_CREATE_INT32(val) :
opnd_create_instr_ex(src_inst, OPSZ_4, 0));
PRE(ilist, instr, push);
mov = NULL;
#ifdef X64
}
#endif
if (first != NULL)
*first = push;
if (second != NULL)
*second = mov;
}
/* Far calls and rets have double total size */
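/* e.g., a 32-bit far call pushes 8 bytes total (a 4-byte cs slot plus a 4-byte
 * return address), so each individual stack entry mangled here is OPSZ_4.
 */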
static opnd_size_t
stack_entry_size(instr_t *instr, opnd_size_t opsize)
{
if (instr_get_opcode(instr) == OP_call_far ||
instr_get_opcode(instr) == OP_call_far_ind ||
instr_get_opcode(instr) == OP_ret_far) {
/* cut OPSZ_8_rex16_short4 in half */
if (opsize == OPSZ_4)
return OPSZ_2;
else if (opsize == OPSZ_8)
return OPSZ_4;
else {
#ifdef X64
ASSERT(opsize == OPSZ_16);
return OPSZ_8;
#else
ASSERT_NOT_REACHED();
#endif
}
} else if (instr_get_opcode(instr) == OP_iret) {
/* convert OPSZ_12_rex40_short6 */
if (opsize == OPSZ_6)
return OPSZ_2;
else if (opsize == OPSZ_12)
return OPSZ_4;
else {
#ifdef X64
ASSERT(opsize == OPSZ_40);
return OPSZ_8;
#else
ASSERT_NOT_REACHED();
#endif
}
}
return opsize;
}
/* Used for fault translation: recognizes the xsp-mangling instrs we insert
 * and accumulates the net xsp adjustment. */
bool
instr_check_xsp_mangling(dcontext_t *dcontext, instr_t *inst, int *xsp_adjust)
{
ASSERT(xsp_adjust != NULL);
if (instr_get_opcode(inst) == OP_push ||
instr_get_opcode(inst) == OP_push_imm) {
LOG(THREAD_GET, LOG_INTERP, 4, "\tstate track: push or push_imm\n");
*xsp_adjust -= opnd_size_in_bytes(opnd_get_size(instr_get_dst(inst, 1)));
} else if (instr_get_opcode(inst) == OP_pop) {
LOG(THREAD_GET, LOG_INTERP, 4, "\tstate track: pop\n");
*xsp_adjust += opnd_size_in_bytes(opnd_get_size(instr_get_src(inst, 1)));
}
/* 1st part of push emulation from insert_push_retaddr */
else if (instr_get_opcode(inst) == OP_lea &&
opnd_get_reg(instr_get_dst(inst, 0)) == REG_XSP &&
opnd_get_base(instr_get_src(inst, 0)) == REG_XSP &&
opnd_get_index(instr_get_src(inst, 0)) == REG_NULL) {
LOG(THREAD_GET, LOG_INTERP, 4, "\tstate track: lea xsp adjust\n");
*xsp_adjust += opnd_get_disp(instr_get_src(inst, 0));
}
/* 2nd part of push emulation from insert_push_retaddr */
else if (instr_get_opcode(inst) == OP_mov_st &&
opnd_is_base_disp(instr_get_dst(inst, 0)) &&
opnd_get_base(instr_get_dst(inst, 0)) == REG_XSP &&
opnd_get_index(instr_get_dst(inst, 0)) == REG_NULL) {
LOG(THREAD_GET, LOG_INTERP, 4, "\tstate track: store to stack\n");
/* nothing to track: paired lea is what we undo */
}
/* retrieval of target for call* or jmp* */
else if ((instr_get_opcode(inst) == OP_movzx &&
reg_overlap(opnd_get_reg(instr_get_dst(inst, 0)), REG_XCX)) ||
(instr_get_opcode(inst) == OP_mov_ld &&
reg_overlap(opnd_get_reg(instr_get_dst(inst, 0)), REG_XCX))) {
LOG(THREAD_GET, LOG_INTERP, 4, "\tstate track: ib tgt to *cx\n");
/* nothing: our xcx spill restore will undo */
}
/* part of pop emulation for iretd/lretd in x64 mode */
else if (instr_get_opcode(inst) == OP_mov_ld &&
opnd_is_base_disp(instr_get_src(inst, 0)) &&
opnd_get_base(instr_get_src(inst, 0)) == REG_XSP &&
opnd_get_index(instr_get_src(inst, 0)) == REG_NULL) {
LOG(THREAD_GET, LOG_INTERP, 4, "\tstate track: load from stack\n");
/* nothing to track: paired lea is what we undo */
}
/* part of data16 ret. once we have cs preservation (PR 271317) we'll
* need to not fail when walking over a movzx to a pop cs (right now we
* do not read the stack for the pop cs).
*/
else if (instr_get_opcode(inst) == OP_movzx &&
opnd_get_reg(instr_get_dst(inst, 0)) == REG_CX) {
LOG(THREAD_GET, LOG_INTERP, 4, "\tstate track: movzx to cx\n");
/* nothing: our xcx spill restore will undo */
}
/* fake pop of cs for iret */
else if (instr_get_opcode(inst) == OP_add &&
opnd_is_reg(instr_get_dst(inst, 0)) &&
opnd_get_reg(instr_get_dst(inst, 0)) == REG_XSP &&
opnd_is_immed_int(instr_get_src(inst, 0))) {
LOG(THREAD_GET, LOG_INTERP, 4, "\tstate track: add to xsp\n");
ASSERT(CHECK_TRUNCATE_TYPE_int(opnd_get_immed_int(instr_get_src(inst, 0))));
*xsp_adjust += (int) opnd_get_immed_int(instr_get_src(inst, 0));
}
/* popf for iret */
else if (instr_get_opcode(inst) == OP_popf) {
LOG(THREAD_GET, LOG_INTERP, 4, "\tstate track: popf\n");
*xsp_adjust += opnd_size_in_bytes(opnd_get_size(instr_get_src(inst, 1)));
} else {
return false;
}
return true;
}
/* N.B.: keep in synch with instr_check_xsp_mangling() */
void
insert_push_retaddr(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr,
ptr_int_t retaddr, opnd_size_t opsize)
{
if (opsize == OPSZ_2) {
ptr_int_t val = retaddr & (ptr_int_t) 0x0000ffff;
/* can't do a non-default operand size with a push immed so we emulate */
PRE(ilist, instr,
INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XSP),
opnd_create_base_disp(REG_XSP, REG_NULL, 0, -2,
OPSZ_lea)));
PRE(ilist, instr,
INSTR_CREATE_mov_st(dcontext, OPND_CREATE_MEM16(REG_XSP, 2),
OPND_CREATE_INT16(val)));
} else if (opsize == OPSZ_PTR
IF_X64(|| (!X64_CACHE_MODE_DC(dcontext) && opsize == OPSZ_4))) {
insert_push_immed_ptrsz(dcontext, retaddr, ilist, instr, NULL, NULL);
} else {
#ifdef X64
ptr_int_t val = retaddr & (ptr_int_t) 0xffffffff;
ASSERT(opsize == OPSZ_4);
/* can't do a non-default operand size with a push immed so we emulate */
PRE(ilist, instr,
INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XSP),
opnd_create_base_disp(REG_XSP, REG_NULL, 0, -4,
OPSZ_lea)));
PRE(ilist, instr,
INSTR_CREATE_mov_st(dcontext, OPND_CREATE_MEM32(REG_XSP, 0),
OPND_CREATE_INT32((int)val)));
#else
ASSERT_NOT_REACHED();
#endif
}
}
#ifdef CLIENT_INTERFACE
/* N.B.: keep in synch with instr_check_xsp_mangling() */
static void
insert_mov_ptr_uint_beyond_TOS(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr,
ptr_int_t value, opnd_size_t opsize)
{
/* we insert non-meta b/c we want faults to go to app (should only fault
* if the ret itself faulted, barring races) for simplicity: o/w our
* mangling sequence gets broken up and becomes more complex.
*/
if (opsize == OPSZ_2) {
ptr_int_t val = value & (ptr_int_t) 0x0000ffff;
PRE(ilist, instr,
INSTR_CREATE_mov_st(dcontext, OPND_CREATE_MEM16(REG_XSP, -2),
OPND_CREATE_INT16(val)));
} else if (opsize == OPSZ_4) {
ptr_int_t val = value & (ptr_int_t) 0xffffffff;
PRE(ilist, instr,
INSTR_CREATE_mov_st(dcontext, OPND_CREATE_MEM32(REG_XSP, -4),
OPND_CREATE_INT32(val)));
} else {
# ifdef X64
ptr_int_t val_low = value & (ptr_int_t) 0xffffffff;
ASSERT(opsize == OPSZ_8);
if (CHECK_TRUNCATE_TYPE_int(value)) {
/* prefer a single write w/ sign-extension */
PRE(ilist, instr,
INSTR_CREATE_mov_st(dcontext, OPND_CREATE_MEM64(REG_XSP, -8),
OPND_CREATE_INT32(val_low)));
} else {
/* we need two 32-bit writes */
ptr_int_t val_high = (value >> 32);
PRE(ilist, instr,
INSTR_CREATE_mov_st(dcontext, OPND_CREATE_MEM32(REG_XSP, -8),
OPND_CREATE_INT32(val_low)));
PRE(ilist, instr,
INSTR_CREATE_mov_st(dcontext, OPND_CREATE_MEM32(REG_XSP, -4),
OPND_CREATE_INT32(val_high)));
}
# else
ASSERT_NOT_REACHED();
# endif
}
}
#endif /* CLIENT_INTERFACE */
static void
insert_push_cs(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr,
ptr_int_t retaddr, opnd_size_t opsize)
{
#ifdef X64
if (X64_CACHE_MODE_DC(dcontext)) {
/* "push cs" is invalid; for now we push the typical cs values.
* i#823 covers doing this more generally.
*/
insert_push_retaddr(dcontext, ilist, instr,
X64_MODE_DC(dcontext) ? CS64_SELECTOR : CS32_SELECTOR, opsize);
} else {
#endif
opnd_t stackop;
/* we go ahead and push cs, but we won't pop into cs */
instr_t *push = INSTR_CREATE_push(dcontext, opnd_create_reg(SEG_CS));
/* 2nd dest is the stack operand size */
stackop = instr_get_dst(push, 1);
opnd_set_size(&stackop, opsize);
instr_set_dst(push, 1, stackop);
PRE(ilist, instr, push);
#ifdef X64
}
#endif
}
/* We spill to XCX(private dcontext) slot for private fragments,
* and to TLS MANGLE_XCX_SPILL_SLOT for shared fragments.
* (Except for DYNAMO_OPTION(private_ib_in_tls), for which all use tls,
* but that has a performance hit because of the extra data cache line)
* We can get away with the split by having the shared ibl routine copy
* xcx to the private dcontext, and by having the private ibl never
* target shared fragments.
* We also have to modify the xcx spill from tls to private dcontext when
* adding a shared basic block to a trace.
*
* FIXME: if we do make non-trace-head basic blocks valid indirect branch
* targets, we should have the private ibl have special code to test the
* flags and copy xcx to the tls slot if necessary.
*/
#define SAVE_TO_DC_OR_TLS(dc, flags, reg, tls_offs, dc_offs) \
((DYNAMO_OPTION(private_ib_in_tls) || TEST(FRAG_SHARED, (flags))) ? \
instr_create_save_to_tls(dc, reg, tls_offs) : \
instr_create_save_to_dcontext((dc), (reg), (dc_offs)))
#define SAVE_TO_DC_OR_TLS_OR_REG(dc, flags, reg, tls_offs, dc_offs, dest_reg) \
((X64_CACHE_MODE_DC(dc) && !X64_MODE_DC(dc) \
IF_X64(&& DYNAMO_OPTION(x86_to_x64_ibl_opt))) ? \
INSTR_CREATE_mov_ld(dc, opnd_create_reg(dest_reg), opnd_create_reg(reg)) : \
SAVE_TO_DC_OR_TLS(dc, flags, reg, tls_offs, dc_offs))
#define RESTORE_FROM_DC_OR_TLS(dc, flags, reg, tls_offs, dc_offs) \
((DYNAMO_OPTION(private_ib_in_tls) || TEST(FRAG_SHARED, (flags))) ? \
instr_create_restore_from_tls(dc, reg, tls_offs) : \
instr_create_restore_from_dcontext((dc), (reg), (dc_offs)))
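/* e.g., for a shared fragment, SAVE_TO_DC_OR_TLS(dcontext, flags, REG_XCX,
 * MANGLE_XCX_SPILL_SLOT, XCX_OFFSET) expands to
 * instr_create_save_to_tls(dcontext, REG_XCX, MANGLE_XCX_SPILL_SLOT).
 */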
static void
mangle_far_direct_helper(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr,
instr_t *next_instr, uint flags)
{
/* FIXME i#823: we do not support other than flat 0-based CS, DS, SS, and ES.
* If the app wants to change segments in a WOW64 process, we will
* do the right thing for standard cs selector values (xref i#49).
* For other cs changes or in other modes, we do go through far_ibl
* today although we do not enact the cs change (nor bother to pass
* the selector in xbx).
*
* For WOW64, I tried keeping this a direct jmp for nice linking by doing the
* mode change in-fragment and then using a 64-bit stub with a 32-bit fragment,
* but that gets messy b/c a lot of code assumes it can create or calculate the
* size of exit stubs given nothing but the fragment flags. I tried adding
* FRAG_ENDS_IN_FAR_DIRECT but still need to pass another param to all the stub
* macros and routines for mid-trace exits and for prefixes for -disable_traces.
* So, going for treating as indirect and using the far_ibl. It's a trace
* barrier anyway, and rare. We treat it as indirect in all modes (including
* x86 builds) for simplicity (and eventually for full i#823 we'll want
* to issue cs changes there too).
*/
app_pc pc = opnd_get_pc(instr_get_target(instr));
#ifdef X64
if (!X64_MODE_DC(dcontext) &&
opnd_get_segment_selector(instr_get_target(instr)) == CS64_SELECTOR) {
PRE(ilist, instr,
SAVE_TO_DC_OR_TLS_OR_REG(dcontext, flags, REG_XBX,
MANGLE_FAR_SPILL_SLOT, XBX_OFFSET, REG_R10));
PRE(ilist, instr,
INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_EBX),
OPND_CREATE_INT32(CS64_SELECTOR)));
}
#endif
PRE(ilist, instr,
SAVE_TO_DC_OR_TLS_OR_REG(dcontext, flags, REG_XCX,
MANGLE_XCX_SPILL_SLOT, XCX_OFFSET, REG_R9));
ASSERT((ptr_uint_t)pc < UINT_MAX); /* 32-bit code! */
PRE(ilist, instr,
INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_ECX),
OPND_CREATE_INT32((ptr_uint_t)pc)));
}
/***************************************************************************
* DIRECT CALL
* Returns new next_instr
*/
instr_t *
mangle_direct_call(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr,
instr_t *next_instr, bool mangle_calls, uint flags)
{
ptr_uint_t retaddr;
app_pc target = NULL;
opnd_t pushop = instr_get_dst(instr, 1);
opnd_size_t pushsz = stack_entry_size(instr, opnd_get_size(pushop));
if (opnd_is_near_pc(instr_get_target(instr)))
target = opnd_get_pc(instr_get_target(instr));
else if (opnd_is_instr(instr_get_target(instr))) {
instr_t *tgt = opnd_get_instr(instr_get_target(instr));
/* assumption: target's raw bits are meaningful */
target = instr_get_raw_bits(tgt);
ASSERT(target != 0);
/* FIXME case 6962: for far instr, we ignore the segment and
* assume it matches current cs */
} else if (opnd_is_far_pc(instr_get_target(instr))) {
target = opnd_get_pc(instr_get_target(instr));
/* FIXME case 6962: we ignore the segment and assume it matches current cs */
} else
ASSERT_NOT_REACHED();
if (!mangle_calls) {
/* off-trace call that will be executed natively */
/* relative target must be re-encoded */
instr_set_raw_bits_valid(instr, false);
#ifdef STEAL_REGISTER
/* FIXME: need to push edi prior to call and pop after.
* However, need to push edi prior to any args to this call,
* and it may be hard to find pre-arg-pushing spot...
* edi is supposed to be callee-saved, we're trusting this
* off-trace call to return, we may as well trust it to
* not trash edi -- these no-inline calls are dynamo's
* own routines, after all.
*/
#endif
return next_instr;
}
retaddr = get_call_return_address(dcontext, ilist, instr);
#ifdef CHECK_RETURNS_SSE2
/* ASSUMPTION: a call to the next instr is not going to ever have a
* matching ret! */
if (target == (app_pc)retaddr) {
LOG(THREAD, LOG_INTERP, 3, "found call to next instruction "PFX"\n", target);
} else {
check_return_handle_call(dcontext, ilist, next_instr);
}
/* now do the normal thing for a call */
#endif
if (instr_get_opcode(instr) == OP_call_far) {
/* N.B.: we do not support other than flat 0-based CS, DS, SS, and ES.
* if the app wants to change segments, we won't actually issue
* a segment change, and so will only work properly if the new segment
* is also 0-based. To properly issue new segments, we'd need a special
* ibl that ends in a far cti, and all prior address manipulations would
* need to be relative to the new segment, w/o messing up current segment.
* FIXME: can we do better without too much work?
* XXX: yes, for wow64: i#823: TODO mangle this like a far direct jmp
*/
SYSLOG_INTERNAL_WARNING_ONCE("Encountered a far direct call");
STATS_INC(num_far_dir_calls);
mangle_far_direct_helper(dcontext, ilist, instr, next_instr, flags);
insert_push_cs(dcontext, ilist, instr, 0, pushsz);
}
/* convert a direct call to a push of the return address */
insert_push_retaddr(dcontext, ilist, instr, retaddr, pushsz);
/* remove the call */
instrlist_remove(ilist, instr);
instr_destroy(dcontext, instr);
return next_instr;
}
#ifdef UNIX
/***************************************************************************
* Mangle a memory reference operand that uses the fs/gs segments:
* get the segment base of fs/gs into reg, and
* replace oldop with a newop that uses reg instead of fs/gs.
* The reg must not be used in oldop; otherwise the reg value
* is corrupted.
*/
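/* Sketch of the transform for "mov fs:[ebx + esi*4 + 8] -> eax" with reg == ecx:
 *   <load app fs base from its TLS slot into ecx>
 *   lea [ebx + ecx*1] -> ecx
 * and the returned operand is [ecx + esi*4 + 8].
 */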
static opnd_t
mangle_seg_ref_opnd(dcontext_t *dcontext, instrlist_t *ilist,
instr_t *where, opnd_t oldop, reg_id_t reg)
{
opnd_t newop;
reg_id_t seg;
ASSERT(opnd_is_far_base_disp(oldop));
seg = opnd_get_segment(oldop);
/* we only mangle fs/gs */
if (seg != SEG_GS && seg != SEG_FS)
return oldop;
#ifdef CLIENT_INTERFACE
if (seg == LIB_SEG_TLS && !INTERNAL_OPTION(private_loader))
return oldop;
#endif
/* The reg should not be used by the oldop */
ASSERT(!opnd_uses_reg(oldop, reg));
/* XXX: this mangling is pattern-matched in translation's instr_is_seg_ref_load() */
/* get app's segment base into reg. */
PRE(ilist, where,
instr_create_restore_from_tls(dcontext, reg,
os_get_app_seg_base_offset(seg)));
if (opnd_get_index(oldop) != REG_NULL &&
opnd_get_base(oldop) != REG_NULL) {
/* if both base and index are used, use
* lea [base, reg, 1] => reg
* to get the base + seg_base into reg.
*/
PRE(ilist, where,
INSTR_CREATE_lea(dcontext, opnd_create_reg(reg),
opnd_create_base_disp(opnd_get_base(oldop),
reg, 1, 0, OPSZ_lea)));
}
if (opnd_get_index(oldop) != REG_NULL) {
newop = opnd_create_base_disp(reg,
opnd_get_index(oldop),
opnd_get_scale(oldop),
opnd_get_disp(oldop),
opnd_get_size(oldop));
} else {
newop = opnd_create_base_disp(opnd_get_base(oldop),
reg, 1,
opnd_get_disp(oldop),
opnd_get_size(oldop));
}
return newop;
}
#endif /* UNIX */
/***************************************************************************
* INDIRECT CALL
*/
static reg_id_t
mangle_far_indirect_helper(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr,
instr_t *next_instr, uint flags, opnd_t *target_out)
{
opnd_t target = *target_out;
opnd_size_t addr_size;
reg_id_t reg_target = REG_NULL;
ASSERT(instr_get_opcode(instr) == OP_jmp_far_ind ||
instr_get_opcode(instr) == OP_call_far_ind);
/* FIXME i#823: we do not support other than flat 0-based CS, DS, SS, and ES.
* If the app wants to change segments in a WOW64 process, we will
* do the right thing for standard cs selector values (xref i#49).
* For other cs changes or in other modes, we do go through far_ibl
* today although we do not enact the cs change (nor bother to pass
* the selector in xbx).
*/
/* opnd type is i_Ep, it's not a far base disp b/c segment is at
* memory location, not specified as segment prefix on instr
* we assume register operands are marked as invalid instrs long
* before this point.
*/
ASSERT(opnd_is_base_disp(target));
/* Segment selector is the final 2 bytes.
* For non-mixed-mode, we ignore it.
* We assume DS base == target cti CS base.
*/
/* if data16 then just 2 bytes for address
* if x64 mode and Intel and rex then 8 bytes for address */
ASSERT((X64_MODE_DC(dcontext) && opnd_get_size(target) == OPSZ_10 &&
proc_get_vendor() != VENDOR_AMD) ||
opnd_get_size(target) == OPSZ_6 || opnd_get_size(target) == OPSZ_4);
if (opnd_get_size(target) == OPSZ_10) {
addr_size = OPSZ_8;
reg_target = REG_RCX;
} else if (opnd_get_size(target) == OPSZ_6) {
addr_size = OPSZ_4;
reg_target = REG_ECX;
} else /* target has OPSZ_4 */ {
addr_size = OPSZ_2;
reg_target = REG_XCX; /* caller uses movzx so size doesn't have to match */
}
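/* Summary of the mapping above (a sketch derived from the assert and the
* if/else chain; the caller loads the offset via movzx/mov so the register
* size need not match exactly):
*   OPSZ_4  (16:16 far ptr)  -> 2-byte offset, loaded into xcx
*   OPSZ_6  (16:32 far ptr)  -> 4-byte offset, loaded into ecx
*   OPSZ_10 (16:64 far ptr)  -> 8-byte offset, loaded into rcx
* with the 2-byte selector sitting just past the offset in memory.
*/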
#ifdef X64
if (mixed_mode_enabled()) {
/* While we don't support arbitrary segments, we do support
* mode changes using standard cs selector values (i#823).
* We save the selector into xbx.
*/
opnd_t sel = target;
opnd_set_disp(&sel, opnd_get_disp(target) + opnd_size_in_bytes(addr_size));
opnd_set_size(&sel, OPSZ_2);
/* all scratch space should be in TLS only */
ASSERT(TEST(FRAG_SHARED, flags) || DYNAMO_OPTION(private_ib_in_tls));
PRE(ilist, instr,
SAVE_TO_DC_OR_TLS_OR_REG(dcontext, flags, REG_XBX,
MANGLE_FAR_SPILL_SLOT, XBX_OFFSET, REG_R10));
PRE(ilist, instr,
INSTR_CREATE_movzx(dcontext, opnd_create_reg(REG_EBX), sel));
if (instr_uses_reg(instr, REG_XBX)) {
/* instr can't be both riprel (uses xax slot for mangling) and use
* a register, so we spill to the riprel (== xax) slot
*/
PRE(ilist, instr,
SAVE_TO_DC_OR_TLS(dcontext, flags, REG_XBX, MANGLE_RIPREL_SPILL_SLOT,
XAX_OFFSET));
POST(ilist, instr,
instr_create_restore_from_tls(dcontext, REG_XBX,
MANGLE_RIPREL_SPILL_SLOT));
}
}
#endif
opnd_set_size(target_out, addr_size);
return reg_target;
}
void
mangle_indirect_call(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr,
instr_t *next_instr, bool mangle_calls, uint flags)
{
opnd_t target;
ptr_uint_t retaddr;
opnd_t pushop = instr_get_dst(instr, 1);
opnd_size_t pushsz = stack_entry_size(instr, opnd_get_size(pushop));
reg_id_t reg_target = REG_XCX;
if (!mangle_calls)
return;
retaddr = get_call_return_address(dcontext, ilist, instr);
/* Convert near, indirect calls. The jump to the exit_stub that
* jumps to indirect_branch_lookup was already inserted into the
* instr list by interp EXCEPT for the case in which we're converting
* an indirect call to a direct call. In that case, mangle later
* inserts a direct exit stub.
*/
/* If this call is marked for conversion, do minimal processing.
* FIXME Just a note that converted calls are not subjected to any of
* the specialized builds' processing further down.
*/
if (TEST(INSTR_IND_CALL_DIRECT, instr->flags)) {
/* convert the call to a push of the return address */
insert_push_retaddr(dcontext, ilist, instr, retaddr, pushsz);
/* remove the call */
instrlist_remove(ilist, instr);
instr_destroy(dcontext, instr);
return;
}
/* put the push AFTER the instruction that calculates
* the target, b/c if target depends on xsp we must use
* the value of xsp prior to this call instruction!
* we insert before next_instr to accomplish this.
*/
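/* Rough shape of the result for a near indirect call (a sketch, not the
* verbatim output; the spill slot and scratch reg depend on flags, and the
* exit jmp to the ibl was already appended by interp):
*   app:      call *(%xax)
*   becomes:  <save %xcx to spill slot>
*             mov (%xax) -> %xcx          (the converted call, done below)
*             push <return address>
*             jmp <exit stub -> indirect_branch_lookup>
*/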
if (instr_get_opcode(instr) == OP_call_far_ind) {
/* goes right before the push of the ret addr */
insert_push_cs(dcontext, ilist, next_instr, 0, pushsz);
/* see notes below -- we don't really support switching segments,
* though we do go ahead and push cs, we won't pop into cs
*/
}
insert_push_retaddr(dcontext, ilist, next_instr, retaddr, pushsz);
/* save away xcx so that we can use it */
/* (it's restored in x86.s (indirect_branch_lookup)) */
PRE(ilist, instr,
SAVE_TO_DC_OR_TLS_OR_REG(dcontext, flags, REG_XCX,
MANGLE_XCX_SPILL_SLOT, XCX_OFFSET, REG_R9));
#ifdef STEAL_REGISTER
/* Steal edi if call uses it, using original call instruction */
steal_reg(dcontext, instr, ilist);
if (ilist->flags)
restore_state(dcontext, next_instr, ilist);
/* It's impossible for our register stealing to use ecx
* because no call can simultaneously use 3 registers, right?
* Maximum is 2, in something like "call *(edi,ecx,4)"?
* If it is possible, need to make sure stealing's use of ecx
* doesn't conflict w/ our use
*/
#endif
/* change: call /2, Ev -> movl Ev, %xcx */
target = instr_get_src(instr, 0);
if (instr_get_opcode(instr) == OP_call_far_ind) {
SYSLOG_INTERNAL_WARNING_ONCE("Encountered a far indirect call");
STATS_INC(num_far_ind_calls);
reg_target = mangle_far_indirect_helper(dcontext, ilist, instr,
next_instr, flags, &target);
}
#ifdef UNIX
/* i#107, mangle the memory reference opnd that uses segment register. */
if (INTERNAL_OPTION(mangle_app_seg) && opnd_is_far_base_disp(target)) {
/* FIXME: we use REG_XCX to store the segment base, which might be used
* in target and cause assertion failure in mangle_seg_ref_opnd.
*/
ASSERT_BUG_NUM(107, !opnd_uses_reg(target, REG_XCX));
target = mangle_seg_ref_opnd(dcontext, ilist, instr, target, REG_XCX);
}
#endif
/* cannot call instr_reset, it will kill prev & next ptrs */
instr_free(dcontext, instr);
instr_set_num_opnds(dcontext, instr, 1, 1);
instr_set_opcode(instr, opnd_get_size(target) == OPSZ_2 ? OP_movzx : OP_mov_ld);
instr_set_dst(instr, 0, opnd_create_reg(reg_target));
instr_set_src(instr, 0, target); /* src stays the same */
if (instrlist_get_translation_target(ilist) != NULL) {
/* make sure original raw bits are used for translation */
instr_set_translation(instr, instr_get_raw_bits(instr));
}
instr_set_our_mangling(instr, true);
#ifdef CHECK_RETURNS_SSE2
check_return_handle_call(dcontext, ilist, next_instr);
#endif
}
/***************************************************************************
* RETURN
*/
#ifdef X64
/* Saves the selector from the top of the stack into xbx, after spilling xbx,
* for far_ibl.
*/
static void
mangle_far_return_save_selector(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr,
uint flags)
{
if (mixed_mode_enabled()) {
/* While we don't support arbitrary segments, we do support
* mode changes using standard cs selector values (i#823).
* We save the selector into xbx.
*/
/* We could do a pop but state xl8 is already set up to restore lea */
/* all scratch space should be in TLS only */
ASSERT(TEST(FRAG_SHARED, flags) || DYNAMO_OPTION(private_ib_in_tls));
PRE(ilist, instr,
SAVE_TO_DC_OR_TLS_OR_REG(dcontext, flags, REG_XBX,
MANGLE_FAR_SPILL_SLOT, XBX_OFFSET, REG_R10));
PRE(ilist, instr,
INSTR_CREATE_movzx(dcontext, opnd_create_reg(REG_EBX),
OPND_CREATE_MEM16(REG_XSP, 0)));
}
}
#endif
void
mangle_return(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr,
instr_t *next_instr, uint flags)
{
instr_t *pop;
opnd_t retaddr;
opnd_size_t retsz;
#ifdef CHECK_RETURNS_SSE2
check_return_handle_return(dcontext, ilist, next_instr);
/* now do the normal ret mangling */
#endif
/* Convert returns. If aggressive we could take advantage of the
* fact that xcx is dead at the return and not bother saving it?
* The jump to the exit_stub that jumps to indirect_branch_lookup
* was already inserted into the instr list by interp. */
/* save away xcx so that we can use it */
/* (it's restored in x86.s (indirect_branch_lookup)) */
PRE(ilist, instr,
SAVE_TO_DC_OR_TLS_OR_REG(dcontext, flags, REG_XCX,
MANGLE_XCX_SPILL_SLOT, XCX_OFFSET, REG_R9));
/* see if ret has an immed int operand, assumed to be 1st src */
if (instr_num_srcs(instr) > 0 && opnd_is_immed_int(instr_get_src(instr, 0))) {
/* if has an operand, return removes some stack space,
* AFTER the return address is popped
*/
int val = (int) opnd_get_immed_int(instr_get_src(instr, 0));
IF_X64(ASSERT_TRUNCATE(val, int, opnd_get_immed_int(instr_get_src(instr, 0))));
/* addl sizeof_param_area, %xsp
* except that clobbers the flags, so we use lea */
PRE(ilist, next_instr,
INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XSP),
opnd_create_base_disp(REG_XSP, REG_NULL, 0, val, OPSZ_lea)));
}
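/* Rough shape of the result for a near "ret" (a sketch; "ret $N" additionally
* pops the parameter area with the lea inserted above, and far/iret variants
* add more below):
*   app:      ret
*   becomes:  <save %xcx to spill slot>
*             pop %xcx                    (return address into xcx)
*             jmp <exit stub -> indirect_branch_lookup>
*/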
/* don't need to steal edi since return cannot use registers */
/* the retaddr operand is always the final source for all OP_ret* instrs */
retaddr = instr_get_src(instr, instr_num_srcs(instr) - 1);
retsz = stack_entry_size(instr, opnd_get_size(retaddr));
if (X64_CACHE_MODE_DC(dcontext) && retsz == OPSZ_4) {
if (instr_get_opcode(instr) == OP_iret || instr_get_opcode(instr) == OP_ret_far) {
/* N.B.: For some unfathomable reason iret and ret_far default to operand
* size 4 in 64-bit mode (making them, along w/ call_far, the only stack
* operation instructions to do so). So if we see an iret or far ret with
* OPSZ_4 in 64-bit mode we need a 4-byte pop, but since we can't actually
* generate a 4-byte pop we have to emulate it here. */
SYSLOG_INTERNAL_WARNING_ONCE("Encountered iretd/lretd in 64-bit mode!");
}
/* Note moving into ecx automatically zero extends which is what we want. */
PRE(ilist, instr,
INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_ECX),
OPND_CREATE_MEM32(REG_RSP, 0)));
/* iret could use add since going to pop the eflags, but not lret.
* lret could combine w/ segment lea below: but not perf-crit instr, and
* anticipating cs preservation PR 271317 I'm leaving separate. */
PRE(ilist, instr,
INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XSP),
opnd_create_base_disp(REG_XSP, REG_NULL, 0, 4, OPSZ_lea)));
} else {
/* change RET into a POP, keeping the operand size */
opnd_t memop = retaddr;
pop = INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_XCX));
/* need per-entry size, not total size (double for far ret) */
opnd_set_size(&memop, retsz);
instr_set_src(pop, 1, memop);
if (retsz == OPSZ_2)
instr_set_dst(pop, 0, opnd_create_reg(REG_CX));
/* We can't do a 4-byte pop in 64-bit mode, but excepting iretd and lretd
* handled above we should never see one. */
ASSERT(!X64_MODE_DC(dcontext) || retsz != OPSZ_4);
PRE(ilist, instr, pop);
if (retsz == OPSZ_2) {
/* we need to zero out the top 2 bytes */
PRE(ilist, instr, INSTR_CREATE_movzx
(dcontext,
opnd_create_reg(REG_ECX), opnd_create_reg(REG_CX)));
}
}
#ifdef CLIENT_INTERFACE
if (TEST(INSTR_CLOBBER_RETADDR, instr->flags)) {
/* we put the value in the note field earlier */
ptr_uint_t val = (ptr_uint_t) instr->note;
insert_mov_ptr_uint_beyond_TOS(dcontext, ilist, instr, val, retsz);
}
#endif
if (instr_get_opcode(instr) == OP_ret_far) {
/* FIXME i#823: we do not support other than flat 0-based CS, DS, SS, and ES.
* If the app wants to change segments in a WOW64 process, we will
* do the right thing for standard cs selector values (xref i#49).
* For other cs changes or in other modes, we do go through far_ibl
* today although we do not enact the cs change (nor bother to pass
* the selector in xbx).
*/
SYSLOG_INTERNAL_WARNING_ONCE("Encountered a far ret");
STATS_INC(num_far_rets);
#ifdef X64
mangle_far_return_save_selector(dcontext, ilist, instr, flags);
#endif
/* pop selector from stack, but not into cs, just junk it
* (the 16-bit selector is expanded to 32 bits on the push, unless data16)
*/
PRE(ilist, instr,
INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XSP),
opnd_create_base_disp
(REG_XSP, REG_NULL, 0,
opnd_size_in_bytes(retsz), OPSZ_lea)));
}
if (instr_get_opcode(instr) == OP_iret) {
instr_t *popf;
/* Xref PR 215553 and PR 191977 - we actually see this on 64-bit Vista */
LOG(THREAD, LOG_INTERP, 2, "Encountered iret at "PFX" - mangling\n",
instr_get_translation(instr));
STATS_INC(num_irets);
/* In 32-bit mode this is a pop->EIP pop->CS pop->eflags.
* 64-bit mode (with either 32-bit or 64-bit operand size,
* despite the (wrong) Intel manual pseudocode: see i#833 and
* the win32.mixedmode test) extends
* the above and additionally adds pop->RSP pop->ss. N.B.: like OP_far_ret we
* ignore the CS (except mixed-mode WOW64) and SS segment changes
* (see the comments there).
*/
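/* Rough shape of the 32-bit expansion (a sketch; the return-address pop was
* already inserted by the common code above, and the x64 path below adds the
* rsp/ss handling):
*   app:      iret
*   becomes:  pop %xcx                    (return address)
*             add $retsz -> %xsp          (discard CS; flags clobber ok, popf follows)
*             popf                        (restore eflags)
*             jmp <exit stub -> indirect_branch_lookup>
*/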
#ifdef X64
mangle_far_return_save_selector(dcontext, ilist, instr, flags);
#endif
/* Return address is already popped, next up is CS segment which we ignore
* (unless in mixed-mode, handled above) so
* adjust stack pointer. Note we can use an add here since the eflags will
* be written below. */
PRE(ilist, instr,
INSTR_CREATE_add(dcontext, opnd_create_reg(REG_XSP),
OPND_CREATE_INT8
(opnd_size_in_bytes(retsz))));
/* Next up is xflags, we use a popf. Popf should be setting the right flags
* (it's difficult to tell because in the docs iret lists the flags it does
* set while popf lists the flags it doesn't set). The docs aren't entirely
* clear, but any flag that we or a user mode program would care about should
* be right. */
popf = INSTR_CREATE_popf(dcontext);
if (X64_CACHE_MODE_DC(dcontext) && retsz == OPSZ_4) {
/* We can't actually create a 32-bit popf and there's no easy way to
* simulate one. For now we'll do a 64-bit popf and fixup the stack offset.
* If AMD/INTEL ever start using the top half of the rflags register then
* we could have problems here. We could also break stack transparency and
* do a mov, push, popf to zero extend the value. */
PRE(ilist, instr, popf);
/* flags are already set, must use lea to fix stack */
PRE(ilist, instr,
INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XSP),
opnd_create_base_disp
(REG_XSP, REG_NULL, 0, -4, OPSZ_lea)));
} else {
/* get popf size right the same way we do it for the return address */
opnd_t memop = retaddr;
opnd_set_size(&memop, retsz);
DOCHECK(1, if (retsz == OPSZ_2)
ASSERT_NOT_TESTED(););
instr_set_src(popf, 1, memop);
PRE(ilist, instr, popf);
}
#ifdef X64
/* In x64 mode, iret additionally does pop->RSP and pop->ss. */
if (X64_MODE_DC(dcontext)) {
if (retsz == OPSZ_8)
PRE(ilist, instr, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_RSP)));
else if (retsz == OPSZ_4) {
PRE(ilist, instr, INSTR_CREATE_mov_ld
(dcontext, opnd_create_reg(REG_ESP), OPND_CREATE_MEM32(REG_RSP, 0)));
} else {
ASSERT_NOT_TESTED();
PRE(ilist, instr, INSTR_CREATE_movzx
(dcontext, opnd_create_reg(REG_ESP), OPND_CREATE_MEM16(REG_RSP, 0)));
}
/* We're ignoring the set of SS and since we just set RSP we don't need
* to do anything to adjust the stack for the pop (since the pop would have
* occurred with the old RSP). */
}
#endif
}
/* remove the ret */
instrlist_remove(ilist, instr);
instr_destroy(dcontext, instr);
}
/***************************************************************************
* INDIRECT JUMP
*/
void
mangle_indirect_jump(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr,
instr_t *next_instr, uint flags)
{
opnd_t target;
reg_id_t reg_target = REG_XCX;
/* Convert indirect branches (that are not returns). Again, the
* jump to the exit_stub that jumps to indirect_branch_lookup
* was already inserted into the instr list by interp. */
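/* Same conversion as mangle_indirect_call above minus the return-address push
* (a sketch): jmp *(%xax) -> <save %xcx>; mov (%xax) -> %xcx;
* jmp <exit stub -> indirect_branch_lookup>.
*/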
/* save away xcx so that we can use it */
/* (it's restored in x86.s (indirect_branch_lookup)) */
PRE(ilist, instr,
SAVE_TO_DC_OR_TLS_OR_REG(dcontext, flags, REG_XCX,
MANGLE_XCX_SPILL_SLOT, XCX_OFFSET, REG_R9));
#ifdef STEAL_REGISTER
/* Steal edi if branch uses it, using original instruction */
steal_reg(dcontext, instr, ilist);
if (ilist->flags)
restore_state(dcontext, next_instr, ilist);
#endif
/* change: jmp /4, i_Ev -> movl i_Ev, %xcx */
target = instr_get_target(instr);
if (instr_get_opcode(instr) == OP_jmp_far_ind) {
SYSLOG_INTERNAL_WARNING_ONCE("Encountered a far indirect jump");
STATS_INC(num_far_ind_jmps);
reg_target = mangle_far_indirect_helper(dcontext, ilist, instr,
next_instr, flags, &target);
}
#ifdef UNIX
/* i#107, mangle the memory reference opnd that uses segment register. */
if (INTERNAL_OPTION(mangle_app_seg) && opnd_is_far_base_disp(target)) {
/* FIXME: we use REG_XCX to store segment base, which might be used
* in target and cause assertion failure in mangle_seg_ref_opnd.
*/
ASSERT_BUG_NUM(107, !opnd_uses_reg(target, REG_XCX));
target = mangle_seg_ref_opnd(dcontext, ilist, instr, target, REG_XCX);
}
#endif
/* cannot call instr_reset, it will kill prev & next ptrs */
instr_free(dcontext, instr);
instr_set_num_opnds(dcontext, instr, 1, 1);
instr_set_opcode(instr, opnd_get_size(target) == OPSZ_2 ? OP_movzx : OP_mov_ld);
instr_set_dst(instr, 0, opnd_create_reg(reg_target));
instr_set_src(instr, 0, target); /* src stays the same */
if (instrlist_get_translation_target(ilist) != NULL) {
/* make sure original raw bits are used for translation */
instr_set_translation(instr, instr_get_raw_bits(instr));
}
instr_set_our_mangling(instr, true);
/* It's impossible for our register stealing to use ecx
* because no branch can simultaneously use 3 registers, right?
* Maximum is 2, in something like "jmp *(edi,ebx,4)"?
* If it is possible, need to make sure stealing's use of ecx
* doesn't conflict w/ our use = FIXME
*/
}
/***************************************************************************
* FAR DIRECT JUMP
*/
void
mangle_far_direct_jump(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr,
instr_t *next_instr, uint flags)
{
SYSLOG_INTERNAL_WARNING_ONCE("Encountered a far direct jmp");
STATS_INC(num_far_dir_jmps);
mangle_far_direct_helper(dcontext, ilist, instr, next_instr, flags);
instrlist_remove(ilist, instr);
instr_destroy(dcontext, instr);
}
/***************************************************************************
* SYSCALL
*/
#ifdef CLIENT_INTERFACE
static bool
cti_is_normal_elision(instr_t *instr)
{
instr_t *next;
opnd_t tgt;
app_pc next_pc;
if (instr == NULL || instr_is_meta(instr))
return false;
if (!instr_is_ubr(instr) && !instr_is_call_direct(instr))
return false;
next = instr_get_next(instr);
if (next == NULL || instr_is_meta(next))
return false;
tgt = instr_get_target(instr);
next_pc = instr_get_translation(next);
if (next_pc == NULL && instr_raw_bits_valid(next))
next_pc = instr_get_raw_bits(next);
if (opnd_is_pc(tgt) && next_pc != NULL && opnd_get_pc(tgt) == next_pc)
return true;
return false;
}
#endif
/* Tries to statically find the syscall number for the
* syscall instruction instr.
* Returns -1 upon failure.
* Note that on MacOS, 32-bit Mach syscalls are encoded using negative numbers
* (although -1 is invalid), so be sure to test for -1 and not just <0 as a failure
* code.
*/
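/* Illustrative match (a sketch; the numbers and registers are hypothetical):
*   mov $0xe7 -> %eax        <- sysnum picked up here
*   mov $0x0  -> %ebx        <- skipped: writes a register other than eax
*   syscall / int $0x80      <- 'instr'
*/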
int
find_syscall_num(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr)
{
int syscall = -1;
instr_t *prev = instr_get_prev(instr);
if (prev != NULL) {
prev = instr_get_prev_expanded(dcontext, ilist, instr);
/* walk backwards looking for "mov_imm imm->xax";
* there may be other instrs placing operands into registers
* for the syscall in between
*/
while (prev != NULL &&
instr_num_dsts(prev) > 0 &&
opnd_is_reg(instr_get_dst(prev, 0)) &&
#ifdef X64
opnd_get_reg(instr_get_dst(prev, 0)) != REG_RAX &&
#endif
opnd_get_reg(instr_get_dst(prev, 0)) != REG_EAX) {
#ifdef CLIENT_INTERFACE
/* if client added cti in between, bail and assume non-ignorable */
if (instr_is_cti(prev) &&
!(cti_is_normal_elision(prev)
IF_WINDOWS(|| instr_is_call_sysenter_pattern
(prev, instr_get_next(prev), instr))))
return -1;
#endif
prev = instr_get_prev_expanded(dcontext, ilist, prev);
}
if (prev != NULL &&
instr_get_opcode(prev) == OP_mov_imm &&
(IF_X64_ELSE(opnd_get_reg(instr_get_dst(prev, 0)) == REG_RAX, true) ||
opnd_get_reg(instr_get_dst(prev, 0)) == REG_EAX)) {
#ifdef CLIENT_INTERFACE
instr_t *walk, *tgt;
#endif
IF_X64(ASSERT_TRUNCATE(int, int,
opnd_get_immed_int(instr_get_src(prev, 0))));
syscall = (int) opnd_get_immed_int(instr_get_src(prev, 0));
#ifdef CLIENT_INTERFACE
/* if client added cti target in between, bail and assume non-ignorable */
for (walk = instrlist_first_expanded(dcontext, ilist);
walk != NULL;
walk = instr_get_next_expanded(dcontext, ilist, walk)) {
if (instr_is_cti(walk) && opnd_is_instr(instr_get_target(walk))) {
for (tgt = opnd_get_instr(instr_get_target(walk));
tgt != NULL;
tgt = instr_get_next_expanded(dcontext, ilist, tgt)) {
if (tgt == prev)
break;
if (tgt == instr)
return -1;
}
}
}
#endif
}
}
return syscall;
}
#ifdef UNIX
/* Inserts code to handle clone into ilist.
* instr is the syscall instr itself.
* Assumes that instructions exist beyond instr in ilist.
* skip decides whether the clone code is skipped by default or not.
*
* N.B.: mangle_clone_code() makes assumptions about this exact code layout
*
* CAUTION: don't use a lot of stack in the generated code because
* get_clone_record() makes assumptions about the usage of stack being
* less than a page.
*/
void
mangle_insert_clone_code(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr,
bool skip _IF_X64(gencode_mode_t mode))
{
/*
int 0x80
.if don't know sysnum statically:
jmp ignore <-- modifiable jmp
.else
jmp xchg # need this so can jmp to ignore if !CLONE_VM
.endif
xchg:
xchg xax,xcx
jecxz child
jmp parent
child:
# i#149/PR 403015: the child is on the dstack so no need to swap stacks
jmp new_thread_dynamo_start
parent:
xchg xax,xcx
ignore:
<post system call, etc.>
*/
instr_t *in = instr_get_next(instr);
instr_t *xchg = INSTR_CREATE_label(dcontext);
instr_t *child = INSTR_CREATE_label(dcontext);
instr_t *parent = INSTR_CREATE_label(dcontext);
ASSERT(in != NULL);
/* we have to dynamically skip or not skip the clone code
* see mangle_clone_code below
*/
if (skip) {
/* Insert a jmp that normally skips the clone stuff,
* pre_system_call will modify it if it really is SYS_clone.
*/
PRE(ilist, in,
INSTR_CREATE_jmp(dcontext, opnd_create_instr(in)));
} else {
/* We have to do this even if we statically know the sysnum
* because if CLONE_VM is not set this is a fork, and we then
* want to skip our clone code.
*/
PRE(ilist, in,
INSTR_CREATE_jmp(dcontext, opnd_create_instr(xchg)));
}
PRE(ilist, in, xchg);
PRE(ilist, in, INSTR_CREATE_xchg(dcontext, opnd_create_reg(REG_XAX),
opnd_create_reg(REG_XCX)));
PRE(ilist, in,
INSTR_CREATE_jecxz(dcontext, opnd_create_instr(child)));
PRE(ilist, in,
INSTR_CREATE_jmp(dcontext, opnd_create_instr(parent)));
PRE(ilist, in, child);
/* We used to insert this directly into fragments for inlined system
* calls, but not once we eliminated clean calls out of the DR cache
* for security purposes. Thus it can be a meta jmp, or an indirect jmp.
*/
insert_reachable_cti(dcontext, ilist, in, vmcode_get_start(),
(byte *) get_new_thread_start(dcontext _IF_X64(mode)),
true/*jmp*/, false/*!precise*/, DR_REG_NULL/*no scratch*/,
NULL);
instr_set_meta(instr_get_prev(in));
PRE(ilist, in, parent);
PRE(ilist, in, INSTR_CREATE_xchg(dcontext, opnd_create_reg(REG_XAX),
opnd_create_reg(REG_XCX)));
}
/* find the system call number in instrlist for an inlined system call
* by simply walking the ilist backward and finding "mov immed => %eax",
* without checking for ctis or expanding instrs
*/
int
ilist_find_sysnum(instrlist_t *ilist, instr_t *instr)
{
for (; instr != NULL; instr = instr_get_prev(instr)) {
if (instr_is_app(instr) &&
instr_get_opcode(instr) == OP_mov_imm &&
opnd_is_reg(instr_get_dst(instr, 0)) &&
opnd_get_reg(instr_get_dst(instr, 0)) == REG_EAX &&
opnd_is_immed_int(instr_get_src(instr, 0)))
return (int) opnd_get_immed_int(instr_get_src(instr, 0));
}
ASSERT_NOT_REACHED();
return -1;
}
#endif /* UNIX */
#ifdef WINDOWS
/* Note that ignore syscalls processing for XP and 2003 is a two-phase operation.
* For this reason, mangle_syscall() might be called with a 'next_instr' that's
* not an original app instruction but one inserted by the earlier mangling phase.
*/
#endif
void
mangle_syscall(dcontext_t *dcontext, instrlist_t *ilist, uint flags,
instr_t *instr, instr_t *next_instr)
{
#ifdef UNIX
if (get_syscall_method() != SYSCALL_METHOD_INT &&
get_syscall_method() != SYSCALL_METHOD_SYSCALL &&
get_syscall_method() != SYSCALL_METHOD_SYSENTER) {
/* don't know convention on return address from kernel mode! */
SYSLOG_INTERNAL_ERROR("unsupported system call method");
LOG(THREAD, LOG_INTERP, 1, "don't know convention for this syscall method\n");
CLIENT_ASSERT(false, "Unsupported system call method detected. Please "
"reboot with the nosep kernel option if this is a 32-bit "
"2.5 or 2.6 version Linux kernel.");
}
/* cannot use dynamo stack in code cache, so we cannot insert a
* call -- instead we have interp end bbs at interrupts unless
* we can identify them as ignorable system calls. Otherwise,
* we just remove the instruction and jump back to dynamo to
* handle it.
*/
if (TESTANY(INSTR_NI_SYSCALL_ALL, instr->flags)) {
instrlist_remove(ilist, instr);
instr_destroy(dcontext, instr);
return;
}
/* signal barrier: need to be able to exit fragment immediately
* prior to syscall, so we set up an exit cti with a jmp right beforehand
* that by default hops over the exit cti.
* when we want to exit right before the syscall, we call the
* mangle_syscall_code() routine below.
*/
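/* Layout produced here (a sketch; mangle_syscall_code() flips the first jmp
* when an exit right before the syscall is needed):
*              jmp short skip_exit     <- default: hop over the exit cti
*              jmp <app pc of syscall> <- non-meta exit cti back to dispatch
*   skip_exit:
*              <syscall>
*/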
instr_t *skip_exit = INSTR_CREATE_label(dcontext);
PRE(ilist, instr, INSTR_CREATE_jmp_short(dcontext, opnd_create_instr(skip_exit)));
/* assumption: raw bits of instr == app pc */
ASSERT(instr_get_raw_bits(instr) != NULL);
/* this should NOT be a meta-instr so we don't use PRE */
/* note that it's ok if this gets linked: we unlink all outgoing exits in
* addition to changing the skip_exit jmp upon receiving a signal
*/
instrlist_preinsert(ilist, instr, INSTR_CREATE_jmp
(dcontext, opnd_create_pc(instr_get_raw_bits(instr))));
PRE(ilist, instr, skip_exit);
if (does_syscall_ret_to_callsite() &&
sysnum_is_not_restartable(ilist_find_sysnum(ilist, instr))) {
/* i#1216: we insert a nop instr right after an inlined non-auto-restart
* syscall to make it a safe point for suspending.
* XXX-i#1216-c#2: we still need to handle auto-restart syscalls
*/
instr_t *nop = INSTR_CREATE_nop(dcontext);
/* We make a fake app nop instr for easy handling in recreate_app_state.
* XXX: it is cleaner to mark our-mangling and handle it, but it seems
* ok to use a fake app nop instr, since the client won't see it.
*/
INSTR_XL8(nop, (instr_get_translation(instr) +
instr_length(dcontext, instr)));
instr_set_app(nop);
instrlist_postinsert(ilist, instr, nop);
}
# ifdef MACOS
if (instr_get_opcode(instr) == OP_sysenter) {
/* The kernel returns control to whatever user-mode places in edx.
* We get control back here and then go to the ret ibl (since normally
* there's a call to a shared routine that does "pop edx").
*/
instr_t *post_sysenter = INSTR_CREATE_label(dcontext);
PRE(ilist, instr,
SAVE_TO_DC_OR_TLS(dcontext, flags, REG_XDX, TLS_XDX_SLOT, XDX_OFFSET));
instrlist_insert_mov_instr_addr(dcontext, post_sysenter, NULL/*in cache*/,
opnd_create_reg(REG_XDX),
ilist, instr, NULL, NULL);
/* sysenter goes here */
PRE(ilist, next_instr, post_sysenter);
PRE(ilist, next_instr,
RESTORE_FROM_DC_OR_TLS(dcontext, flags, REG_XDX, TLS_XDX_SLOT, XDX_OFFSET));
PRE(ilist, next_instr,
SAVE_TO_DC_OR_TLS(dcontext, flags, REG_XCX, TLS_XCX_SLOT, XCX_OFFSET));
PRE(ilist, next_instr,
INSTR_CREATE_mov_st(dcontext, opnd_create_reg(REG_XCX),
opnd_create_reg(REG_XDX)));
} else if (TEST(INSTR_BRANCH_SPECIAL_EXIT, instr->flags)) {
int num = instr_get_interrupt_number(instr);
ASSERT(instr_get_opcode(instr) == OP_int);
if (num == 0x81 || num == 0x82) {
int reason = (num == 0x81) ? EXIT_REASON_NI_SYSCALL_INT_0x81 :
EXIT_REASON_NI_SYSCALL_INT_0x82;
if (DYNAMO_OPTION(private_ib_in_tls) || TEST(FRAG_SHARED, flags)) {
insert_shared_get_dcontext(dcontext, ilist, instr, true/*save_xdi*/);
PRE(ilist, instr, INSTR_CREATE_mov_st
(dcontext,
opnd_create_dcontext_field_via_reg_sz(dcontext, REG_NULL/*default*/,
EXIT_REASON_OFFSET, OPSZ_4),
OPND_CREATE_INT32(reason)));
insert_shared_restore_dcontext_reg(dcontext, ilist, instr);
} else {
PRE(ilist, instr,
instr_create_save_immed_to_dcontext(dcontext, reason,
EXIT_REASON_OFFSET));
}
}
}
# endif
# ifdef STEAL_REGISTER
/* in linux, system calls get their parameters via registers.
* edi is the last one used, but there are system calls that
* use it, so we put the real value into edi. plus things
* like fork() should get the real register values.
* it's also a good idea to put the real edi into %edi for
* debugger interrupts (int3).
*/
/* the only way we can save and then restore our dc
* ptr is to use the stack!
* this should be fine, all interrupt instructions push
* both eflags and return address on stack, so esp must
* be valid at this point. there could be an application
* assuming only 2 slots on stack will be used, we use a 3rd
* slot, could mess up that app...but what can we do?
* also, if kernel examines user stack, we could have problems.
* push edi # push dcontext ptr
* restore edi # restore app edi
* <syscall>
* push ebx
* mov edi, ebx
* mov 4(esp), edi # get dcontext ptr
* save ebx to edi slot
* pop ebx
* add 4,esp # clean up push of dcontext ptr
*/
IF_X64(ASSERT_NOT_IMPLEMENTED(false));
PRE(ilist, instr,
INSTR_CREATE_push(dcontext, opnd_create_reg(REG_EDI)));
PRE(ilist, instr,
instr_create_restore_from_dcontext(dcontext, REG_EDI, XDI_OFFSET));
/* insert after in reverse order: */
POST(ilist, instr,
INSTR_CREATE_add(dcontext, opnd_create_reg(REG_ESP),
OPND_CREATE_INT8(4)));
POST(ilist, instr,
INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_EBX)));
POST(ilist, instr,
instr_create_save_to_dcontext(dcontext, REG_EBX, XDI_OFFSET));
POST(ilist, instr,
INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_EDI),
OPND_CREATE_MEM32(REG_ESP, 4)));
POST(ilist, instr,
INSTR_CREATE_mov_ld(dcontext, opnd_create_reg(REG_EBX),
opnd_create_reg(REG_EDI)));
POST(ilist, instr,
INSTR_CREATE_push(dcontext, opnd_create_reg(REG_EBX)));
# endif /* STEAL_REGISTER */
#else /* WINDOWS */
/* special handling of system calls is performed in shared_syscall or
* in do_syscall
*/
/* FIXME: for ignorable syscalls,
* do we need support for exiting mid-fragment prior to a syscall
* like we do on Linux, to bound time in cache?
*/
if (does_syscall_ret_to_callsite()) {
uint len = instr_length(dcontext, instr);
if (TEST(INSTR_SHARED_SYSCALL, instr->flags)) {
ASSERT(DYNAMO_OPTION(shared_syscalls));
/* this syscall will be performed by the shared_syscall code
* we just need to place a return address into the dcontext
* xsi slot or the mangle-next-tag tls slot
*/
if (DYNAMO_OPTION(shared_fragment_shared_syscalls)) {
# ifdef X64
ASSERT(instr_raw_bits_valid(instr));
/* PR 244741: no 64-bit store-immed-to-mem
* FIXME: would be nice to move this to the stub and
* use the dead rbx register!
*/
PRE(ilist, instr,
instr_create_save_to_tls(dcontext, REG_XCX, MANGLE_NEXT_TAG_SLOT));
PRE(ilist, instr,
INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_XCX),
OPND_CREATE_INTPTR((instr->bytes + len))));
PRE(ilist, instr, INSTR_CREATE_xchg
(dcontext, opnd_create_tls_slot(os_tls_offset(MANGLE_NEXT_TAG_SLOT)),
opnd_create_reg(REG_XCX)));
# else
PRE(ilist, instr, INSTR_CREATE_mov_st
(dcontext, opnd_create_tls_slot(os_tls_offset(MANGLE_NEXT_TAG_SLOT)),
OPND_CREATE_INTPTR((instr->bytes + len))));
# endif
}
else {
PRE(ilist, instr, instr_create_save_immed_to_dcontext
(dcontext, (uint)(ptr_uint_t)(instr->bytes + len), XSI_OFFSET));
}
}
/* Handle ignorable syscall. non-ignorable system calls are
* destroyed and removed from the list at the end of this func.
*/
else if (!TEST(INSTR_NI_SYSCALL, instr->flags)) {
if (get_syscall_method() == SYSCALL_METHOD_INT &&
DYNAMO_OPTION(sygate_int)) {
/* for Sygate we need to mangle into a call to int_syscall_addr.
* is anyone going to get screwed up by this change
* (say, a flags change)? [-ignore_syscalls only] */
ASSERT_NOT_TESTED();
instrlist_replace(ilist, instr, create_syscall_instr(dcontext));
instr_destroy(dcontext, instr);
} else if (get_syscall_method() == SYSCALL_METHOD_SYSCALL)
ASSERT_NOT_TESTED();
else if (get_syscall_method() == SYSCALL_METHOD_WOW64)
ASSERT_NOT_TESTED();
return;
}
} else if (get_syscall_method() == SYSCALL_METHOD_SYSENTER) {
/* on XP/2003 we have a choice between inserting a trampoline at the
* return pt of the sysenter, which is 0x7ffe0304 (except for
* SP2-patched XP), which is bad since it would clobber whatever's after
* the ret there (unless we used a 0xcc, like Visual Studio 2005 debugger
* does), or replacing the ret addr on the stack -- we choose the
* latter as the lesser of two transparency evils. Note that the
* page at 0x7ffe0000 can't be made writable anyway, so hooking
* isn't possible.
*/
if (TEST(INSTR_SHARED_SYSCALL, instr->flags)) {
ASSERT(DYNAMO_OPTION(shared_syscalls));
}
/* Handle ignorable syscall. non-ignorable system calls are
* destroyed and removed from the list at the end of this func.
*/
else if (!TEST(INSTR_NI_SYSCALL, instr->flags)) {
instr_t *mov_imm;
/* even w/ ignorable syscall, need to make sure regain control */
ASSERT(next_instr != NULL);
ASSERT(DYNAMO_OPTION(indcall2direct));
/* for sygate hack need to basically duplicate what is done in
* shared_syscall, but here we could be shared so would need to
* grab dcontext first etc. */
ASSERT_NOT_IMPLEMENTED(!DYNAMO_OPTION(sygate_sysenter));
/* PR 253943: we don't support sysenter in x64 */
IF_X64(ASSERT_NOT_IMPLEMENTED(false)); /* can't have 8-byte imm-to-mem */
/* FIXME PR 303413: we won't properly translate a fault in our
* app stack reference here. It's marked as our own mangling
* so we'll at least return failure from our translate routine.
*/
mov_imm = INSTR_CREATE_mov_st(dcontext, OPND_CREATE_MEM32(REG_XSP, 0),
opnd_create_instr(next_instr));
ASSERT(instr_is_mov_imm_to_tos(mov_imm));
PRE(ilist, instr, mov_imm);
/* do not let any encoding for length be cached!
* o/w will lose pc-relative opnd
*/
/* 'next_instr' is executed after the after-syscall vsyscall
* 'ret', which is executed natively. */
instr_set_meta(instr_get_prev(instr));
return; /* leave syscall instr alone */
}
} else {
SYSLOG_INTERNAL_ERROR("unsupported system call method");
LOG(THREAD, LOG_INTERP, 1, "don't know convention for this syscall method\n");
if (!TEST(INSTR_NI_SYSCALL, instr->flags))
return;
ASSERT_NOT_IMPLEMENTED(false);
}
/* destroy the syscall instruction */
instrlist_remove(ilist, instr);
instr_destroy(dcontext, instr);
#endif /* WINDOWS */
}
#ifdef UNIX
/* Makes sure the jmp immediately after the syscall instruction
* either skips or doesn't skip the clone code following it,
* as indicated by the parameter skip.
* pc must be either the return address of pre_system_call or
* the address of do_syscall.
*/
void
mangle_clone_code(dcontext_t *dcontext, byte *pc, bool skip)
{
byte *target, *prev_pc;
instr_t instr;
instr_init(dcontext, &instr);
LOG(THREAD, LOG_SYSCALLS, 3,
"mangle_clone_code: pc="PFX", skip=%d\n", pc, skip);
do {
instr_reset(dcontext, &instr);
pc = decode(dcontext, pc, &instr);
ASSERT(pc != NULL); /* our own code! */
} while (!instr_is_syscall(&instr));
/* jmp is right after syscall */
instr_reset(dcontext, &instr);
prev_pc = pc;
pc = decode(dcontext, pc, &instr);
ASSERT(pc != NULL); /* our own code! */
ASSERT(instr_get_opcode(&instr) == OP_jmp);
if (skip) {
/* target is after 3rd xchg */
instr_t tmp_instr;
int num_xchg = 0;
target = pc;
instr_init(dcontext, &tmp_instr);
while (num_xchg <= 2) {
instr_reset(dcontext, &tmp_instr);
target = decode(dcontext, target, &tmp_instr);
ASSERT(target != NULL); /* our own code! */
if (instr_get_opcode(&tmp_instr) == OP_xchg)
num_xchg++;
}
instr_free(dcontext, &tmp_instr);
} else {
target = pc;
}
if (opnd_get_pc(instr_get_target(&instr)) != target) {
DEBUG_DECLARE(byte *nxt_pc;)
LOG(THREAD, LOG_SYSCALLS, 3,
"\tmodifying target of after-clone jmp to "PFX"\n", target);
instr_set_target(&instr, opnd_create_pc(target));
DEBUG_DECLARE(nxt_pc = ) instr_encode(dcontext, &instr, prev_pc);
ASSERT(nxt_pc != NULL && nxt_pc == pc);
} else {
LOG(THREAD, LOG_SYSCALLS, 3,
"\ttarget of after-clone jmp is already "PFX"\n", target);
}
instr_free(dcontext, &instr);
}
/* If skip is false:
* changes the jmp right before the next syscall (after pc) to target the
* exit cti immediately following it;
* If skip is true:
* changes back to the default, where skip hops over the exit cti,
* which is assumed to be located at pc.
*/
bool
mangle_syscall_code(dcontext_t *dcontext, fragment_t *f, byte *pc, bool skip)
{
byte *stop_pc = fragment_body_end_pc(dcontext, f);
byte *target, *prev_pc, *cti_pc = NULL, *skip_pc = NULL;
instr_t instr;
DEBUG_DECLARE(instr_t cti;)
instr_init(dcontext, &instr);
DODEBUG({ instr_init(dcontext, &cti); });
LOG(THREAD, LOG_SYSCALLS, 3,
"mangle_syscall_code: pc="PFX", skip=%d\n", pc, skip);
do {