/* **********************************************************
* Copyright (c) 2014-2021 Google, Inc. All rights reserved.
* Copyright (c) 2016 ARM Limited. All rights reserved.
* **********************************************************/
/*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of ARM Limited nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL ARM LIMITED OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
#include "../globals.h"
#include "arch.h"
#include "instr.h"
#include "instr_create.h"
#include "instrlist.h"
#include "instrument.h"
/* shorten code generation lines */
#define APP instrlist_meta_append
#define PRE instrlist_meta_preinsert
#define OPREG opnd_create_reg
#define NOP_INST 0xd503201f /* nop */
#define BR_X1_INST (0xd61f0000 | 1 << 5) /* br x1 */
/***************************************************************************/
/* EXIT STUB */
/***************************************************************************/
/* We use multiple approaches to linking based on how far away the target
* fragment is:
*
* Unlinked:
* exit_cti stub
* ...
* stub:
* stp x0, x1, [x28]
* movz x0, #&linkstub[0, 16), lsl #0x00
* movk x0, #&linkstub[16, 32), lsl #0x10
* movk x0, #&linkstub[32, 48), lsl #0x20
* movk x0, #&linkstub[48, 64), lsl #0x30
* ldr x1, [#8/#12]
* br x1
* <fcache-return>
*
* Linked, exit_cti_reaches_target (near fragment):
* exit_cti target_fragment
* ...
* stub:
* stp x0, x1, [x28]
* movz x0, #&linkstub[0, 16), lsl #0x00
* movk x0, #&linkstub[16, 32), lsl #0x10
* movk x0, #&linkstub[32, 48), lsl #0x20
* movk x0, #&linkstub[48, 64), lsl #0x30
* ldr x1, [#8/#12]
* br x1
* <fcache-return>
*
* Linked, unconditional branch reaches target (intermediate fragment):
* exit_cti stub
* ...
* stub:
* b target_fragment
* movz x0, #&linkstub[0, 16), lsl #0x00
* movk x0, #&linkstub[16, 32), lsl #0x10
* movk x0, #&linkstub[32, 48), lsl #0x20
* movk x0, #&linkstub[48, 64), lsl #0x30
* ldr x1, [#8/#12]
* br x1
* <fcache-return>
*
* Linked, !unconditional branch reaches target (far fragment):
* exit_cti stub
* ...
* stub:
* stp x0, x1, [x28]
* movz x0, #&linkstub[0, 16), lsl #0x00
* movk x0, #&linkstub[16, 32), lsl #0x10
* movk x0, #&linkstub[32, 48), lsl #0x20
* movk x0, #&linkstub[48, 64), lsl #0x30
* ldr x1, [#8/#12]
* br x1
* <target_fragment_prefix>
*
* To ensure atomicity of <target> patching, the data slot must be 8-byte
* aligned. We do this by reserving 12 bytes for the data slot and using the
* appropriate offset in the ldr for the 8-byte aligned 8-byte region within it.
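*
* For example, when all four mov/movk instructions are present as drawn
* above, the ldr is the stub's 6th instruction (byte offset 20) and the
* 12-byte data slot occupies bytes 28..39. The 8-byte aligned 8-byte region
* within it therefore starts at byte 28 or byte 32, depending on the stub's
* own alignment, which is where the #8/#12 literal offset in the ldr comes
* from.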
*
* For complete design details, see the following wiki
* https://dynamorio.org/page_aarch64_far.html
*/
byte *
insert_relative_target(byte *pc, cache_pc target, bool hot_patch)
{
ASSERT_NOT_IMPLEMENTED(false); /* FIXME i#1569 */
return NULL;
}
byte *
insert_relative_jump(byte *pc, cache_pc target, bool hot_patch)
{
ASSERT_NOT_IMPLEMENTED(false); /* FIXME i#1569 */
return NULL;
}
uint
nop_pad_ilist(dcontext_t *dcontext, fragment_t *f, instrlist_t *ilist, bool emitting)
{
ASSERT_NOT_IMPLEMENTED(false); /* FIXME i#1569 */
return 0;
}
size_t
get_fcache_return_tls_offs(dcontext_t *dcontext, uint flags)
{
/* AArch64 always uses shared gencode so we ignore FRAG_DB_SHARED(flags) */
if (TEST(FRAG_COARSE_GRAIN, flags)) {
/* FIXME i#1575: coarse-grain NYI on AArch64 */
ASSERT_NOT_IMPLEMENTED(false);
return 0;
}
return TLS_FCACHE_RETURN_SLOT;
}
/* Generate move (immediate) of a 64-bit value using at most 4 instructions.
* pc must be a writable (vmcode) pc.
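* For example, val = 0x1234 emits a single movz, while val = 0x7f0012340000
* emits movz (for the all-zero low chunk) plus movk lsl #16 and movk lsl #32;
* any all-zero 16-bit chunk above the lowest one is skipped.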
*/
static uint *
insert_mov_imm(uint *pc, reg_id_t dst, ptr_int_t val)
{
uint rt = dst - DR_REG_X0;
ASSERT(rt < 31);
*pc++ = 0xd2800000 | rt | (val & 0xffff) << 5; /* mov x(rt), #x */
if ((val >> 16 & 0xffff) != 0)
*pc++ = 0xf2a00000 | rt | (val >> 16 & 0xffff) << 5; /* movk x(rt), #x, lsl #16 */
if ((val >> 32 & 0xffff) != 0)
*pc++ = 0xf2c00000 | rt | (val >> 32 & 0xffff) << 5; /* movk x(rt), #x, lsl #32 */
if ((val >> 48 & 0xffff) != 0)
*pc++ = 0xf2e00000 | rt | (val >> 48 & 0xffff) << 5; /* movk x(rt), #x, lsl #48 */
return pc;
}
/* Returns addr for the target_pc data slot of the given stub. The slot starts at the
* 8-byte aligned region in the 12-byte slot reserved in the stub.
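* E.g., for a 40-byte direct stub (7 instructions plus the 12-byte slot), the
* reserved region is stub_pc + 28..39 and the returned slot is the 8-byte
* aligned address at stub_pc + 28 or stub_pc + 32, in the writable view.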
*/
static ptr_uint_t *
get_target_pc_slot(fragment_t *f, cache_pc stub_pc)
{
return (ptr_uint_t *)ALIGN_FORWARD(
vmcode_get_writable_addr(stub_pc + DIRECT_EXIT_STUB_SIZE(f->flags) -
DIRECT_EXIT_STUB_DATA_SZ),
8);
}
/* Emit code for the exit stub at stub_pc. Return the size of the
* emitted code in bytes. This routine assumes that the caller will
* take care of any cache synchronization necessary.
* The stub is unlinked initially, except coarse grain indirect exits,
* which are always linked.
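* For direct exits the emitted stub has a fixed size: stp + 1-4 mov/movk +
* ldr + br, NOP-padded to 7 instructions, followed by the 12-byte data slot.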
*/
int
insert_exit_stub_other_flags(dcontext_t *dcontext, fragment_t *f, linkstub_t *l,
cache_pc stub_pc, ushort l_flags)
{
uint *write_stub_pc = (uint *)vmcode_get_writable_addr(stub_pc);
uint *pc = write_stub_pc;
uint num_nops_needed = 0;
/* FIXME i#1575: coarse-grain NYI on ARM */
ASSERT_NOT_IMPLEMENTED(!TEST(FRAG_COARSE_GRAIN, f->flags));
if (LINKSTUB_DIRECT(l_flags)) {
/* stp x0, x1, [x(stolen), #(offs)] */
*pc++ = (0xa9000000 | 0 | 1 << 10 | (dr_reg_stolen - DR_REG_X0) << 5 |
TLS_REG0_SLOT >> 3 << 15);
/* mov x0, ... */
pc = insert_mov_imm(pc, DR_REG_X0, (ptr_int_t)l);
/* Subtract 1 for the stp: insert_mov_imm emitted between 1 and 4 instructions. */
num_nops_needed = 4 - (pc - write_stub_pc - 1);
ptr_uint_t *target_pc_slot = get_target_pc_slot(f, stub_pc);
ASSERT(pc < (uint *)target_pc_slot);
uint target_pc_slot_offs = (uint *)target_pc_slot - pc;
/* ldr x1, [pc, target_pc_slot_offs * AARCH64_INSTR_SIZE] */
*pc++ = (0x58000000 | (DR_REG_X1 - DR_REG_X0) | target_pc_slot_offs << 5);
/* br x1 */
*pc++ = BR_X1_INST;
/* Fill up with NOPs, depending on how many instructions we needed to move
* the immediate into a register. Ideally we would skip adding NOPs, but
* lots of places expect the stub size to be fixed.
*/
for (uint j = 0; j < num_nops_needed; j++)
*pc++ = NOP_INST;
/* The final slot is a data slot, which will hold the address of either
* the fcache-return routine or the linked fragment. We reserve 12 bytes
* and use the 8-byte aligned region of 8 bytes within it.
*/
ASSERT(pc == (uint *)target_pc_slot || pc + 1 == (uint *)target_pc_slot);
ASSERT(sizeof(app_pc) == 8);
pc += DIRECT_EXIT_STUB_DATA_SZ / sizeof(uint);
/* We start off with the fcache-return routine address in the slot. */
/* AArch64 uses shared gencode, so the fcache_return routine address is the
* same no matter which thread creates/unpatches the stub.
*/
ASSERT(fcache_return_routine(dcontext) == fcache_return_routine(GLOBAL_DCONTEXT));
*target_pc_slot = (ptr_uint_t)fcache_return_routine(dcontext);
ASSERT((ptr_int_t)((byte *)pc - (byte *)write_stub_pc) ==
DIRECT_EXIT_STUB_SIZE(l_flags));
} else {
/* Stub starts out unlinked. */
cache_pc exit_target =
get_unlinked_entry(dcontext, EXIT_TARGET_TAG(dcontext, f, l));
/* stp x0, x1, [x(stolen), #(offs)] */
*pc++ = (0xa9000000 | 0 | 1 << 10 | (dr_reg_stolen - DR_REG_X0) << 5 |
TLS_REG0_SLOT >> 3 << 15);
/* mov x0, ... */
pc = insert_mov_imm(pc, DR_REG_X0, (ptr_int_t)l);
/* Subtract 1 for the stp: insert_mov_imm emitted between 1 and 4 instructions. */
num_nops_needed = 4 - (pc - write_stub_pc - 1);
/* ldr x1, [x(stolen), #(offs)] */
*pc++ = (0xf9400000 | 1 | (dr_reg_stolen - DR_REG_X0) << 5 |
get_ibl_entry_tls_offs(dcontext, exit_target) >> 3 << 10);
/* br x1 */
*pc++ = BR_X1_INST;
/* Fill up with NOPs, depending on how many instructions we needed to move
* the immediate into a register. Ideally we would skip adding NOPs, but
* lots of places expect the stub size to be fixed.
*/
for (uint j = 0; j < num_nops_needed; j++)
*pc++ = NOP_INST;
}
return (int)((byte *)pc - (byte *)write_stub_pc);
}
bool
exit_cti_reaches_target(dcontext_t *dcontext, fragment_t *f, linkstub_t *l,
cache_pc target_pc)
{
cache_pc branch_pc = EXIT_CTI_PC(f, l);
/* Compute offset as unsigned, modulo arithmetic. */
ptr_uint_t off = (ptr_uint_t)target_pc - (ptr_uint_t)branch_pc;
uint enc = *(uint *)branch_pc;
ASSERT(ALIGNED(branch_pc, 4) && ALIGNED(target_pc, 4));
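/* Each range check below has the form "off + 2^k < 2^(k+1)", which, using
 * unsigned wraparound, accepts exactly the offsets whose signed value lies
 * in [-2^k, 2^k).
 */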
if ((enc & 0xfc000000) == 0x14000000) /* B (OP_b) */
return (off + 0x8000000 < 0x10000000);
else if ((enc & 0xff000010) == 0x54000000 ||
(enc & 0x7e000000) == 0x34000000) /* B.cond, CBNZ, CBZ */
return (off + 0x40000 < 0x80000);
else if ((enc & 0x7e000000) == 0x36000000) /* TBNZ, TBZ */
return (off + 0x2000 < 0x4000);
ASSERT(false);
return false;
}
void
patch_stub(fragment_t *f, cache_pc stub_pc, cache_pc target_pc, cache_pc target_prefix_pc,
bool hot_patch)
{
/* Compute offset as unsigned, modulo arithmetic. */
ptr_uint_t off = (ptr_uint_t)target_pc - (ptr_uint_t)stub_pc;
if (off + 0x8000000 < 0x10000000) {
/* target_pc is a near fragment. We can get there with a B
* (OP_b, 26-bit signed immediate offset).
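* For example, off = +8 encodes as 0x14000002 (imm26 = 8 >> 2).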
* i#1911: Patching arbitrary instructions to an unconditional branch
* is theoretically not sound. Architectural specifications do not
* guarantee safe behaviour or any bound on when the change will be
* visible to other processor elements.
*/
*(uint *)vmcode_get_writable_addr(stub_pc) =
(0x14000000 | (0x03ffffff & off >> 2));
if (hot_patch)
machine_cache_sync(stub_pc, stub_pc + 4, true);
return;
}
/* target_pc is a far fragment. We must use an indirect branch. Note that the indirect
* branch needs to be to the fragment prefix, as we need to restore the clobbered
* regs.
*/
/* We set hot_patch to false as we are not modifying code. */
ATOMIC_8BYTE_ALIGNED_WRITE(get_target_pc_slot(f, stub_pc),
(ptr_uint_t)target_prefix_pc,
/*hot_patch=*/false);
return;
}
static bool
stub_is_patched_for_intermediate_fragment_link(dcontext_t *dcontext, cache_pc stub_pc)
{
uint enc;
ATOMIC_4BYTE_ALIGNED_READ(stub_pc, &enc);
return (enc & 0xfc000000) == 0x14000000; /* B (OP_b) */
}
static bool
stub_is_patched_for_far_fragment_link(dcontext_t *dcontext, fragment_t *f,
cache_pc stub_pc)
{
ptr_uint_t target_pc;
ATOMIC_8BYTE_ALIGNED_READ(get_target_pc_slot(f, stub_pc), &target_pc);
return target_pc != (ptr_uint_t)fcache_return_routine(dcontext);
}
bool
stub_is_patched(dcontext_t *dcontext, fragment_t *f, cache_pc stub_pc)
{
return stub_is_patched_for_intermediate_fragment_link(dcontext, stub_pc) ||
stub_is_patched_for_far_fragment_link(dcontext, f, stub_pc);
}
void
unpatch_stub(dcontext_t *dcontext, fragment_t *f, cache_pc stub_pc, bool hot_patch)
{
/* At any time, at most one patching strategy will be in effect: the one for
* intermediate fragments or the one for far fragments.
*/
if (stub_is_patched_for_intermediate_fragment_link(dcontext, stub_pc)) {
/* Restore the stp x0, x1, [x(stolen), #(offs)]
* i#1911: Patching unconditional branch to some arbitrary instruction
* is theoretically not sound. Architectural specifications do not
* guarantee safe behaviour or any bound on when the change will be
* visible to other processor elements.
*/
*(uint *)vmcode_get_writable_addr(stub_pc) =
(0xa9000000 | 0 | 1 << 10 | (dr_reg_stolen - DR_REG_X0) << 5 |
TLS_REG0_SLOT >> 3 << 15);
if (hot_patch)
machine_cache_sync(stub_pc, stub_pc + AARCH64_INSTR_SIZE, true);
} else if (stub_is_patched_for_far_fragment_link(dcontext, f, stub_pc)) {
/* Restore the data slot to fcache return address. */
/* AArch64 uses shared gencode, so the fcache_return routine address is the
* same no matter which thread creates/unpatches the stub.
*/
ASSERT(fcache_return_routine(dcontext) == fcache_return_routine(GLOBAL_DCONTEXT));
/* We set hot_patch to false as we are not modifying code. */
ATOMIC_8BYTE_ALIGNED_WRITE(get_target_pc_slot(f, stub_pc),
(ptr_uint_t)fcache_return_routine(dcontext),
/*hot_patch=*/false);
}
}
void
patch_branch(dr_isa_mode_t isa_mode, cache_pc branch_pc, cache_pc target_pc,
bool hot_patch)
{
/* Compute offset as unsigned, modulo arithmetic. */
ptr_uint_t off = (ptr_uint_t)target_pc - (ptr_uint_t)branch_pc;
uint *pc_writable = (uint *)vmcode_get_writable_addr(branch_pc);
uint enc = *pc_writable;
ASSERT(ALIGNED(branch_pc, 4) && ALIGNED(target_pc, 4));
if ((enc & 0xfc000000) == 0x14000000) { /* B */
ASSERT(off + 0x8000000 < 0x10000000);
*pc_writable = (0x14000000 | (0x03ffffff & off >> 2));
} else if ((enc & 0xff000010) == 0x54000000 ||
(enc & 0x7e000000) == 0x34000000) { /* B.cond, CBNZ, CBZ */
ASSERT(off + 0x40000 < 0x80000);
*pc_writable = (enc & 0xff00001f) | (0x00ffffe0 & off >> 2 << 5);
} else if ((enc & 0x7e000000) == 0x36000000) { /* TBNZ, TBZ */
ASSERT(off + 0x2000 < 0x4000);
*pc_writable = (enc & 0xfff8001f) | (0x0007ffe0 & off >> 2 << 5);
} else
ASSERT(false);
if (hot_patch)
machine_cache_sync(branch_pc, branch_pc + 4, true);
return;
}
uint
patchable_exit_cti_align_offs(dcontext_t *dcontext, instr_t *inst, cache_pc pc)
{
return 0; /* always aligned */
}
cache_pc
exit_cti_disp_pc(cache_pc branch_pc)
{
ASSERT_NOT_IMPLEMENTED(false); /* FIXME i#1569 */
return NULL;
}
/* Skips NOP instructions backwards until the first non-NOP instruction is found. */
static uint *
get_stub_branch(uint *pc)
{
/* Skip NOP instructions backwards. */
while (*pc == NOP_INST)
pc--;
/* The first non-NOP instruction must be the branch. */
ASSERT(*pc == BR_X1_INST);
return pc;
}
void
link_indirect_exit_arch(dcontext_t *dcontext, fragment_t *f, linkstub_t *l,
bool hot_patch, app_pc target_tag)
{
byte *stub_pc = (byte *)EXIT_STUB_PC(dcontext, f, l);
uint *pc;
cache_pc exit_target;
ibl_type_t ibl_type = { 0 };
DEBUG_DECLARE(bool is_ibl =)
get_ibl_routine_type_ex(dcontext, target_tag, &ibl_type);
ASSERT(is_ibl);
if (IS_IBL_LINKED(ibl_type.link_state))
exit_target = target_tag;
else
exit_target = get_linked_entry(dcontext, target_tag);
/* Set pc to the last instruction in the stub. */
pc = (uint *)(stub_pc + exit_stub_size(dcontext, target_tag, f->flags) -
AARCH64_INSTR_SIZE);
pc = get_stub_branch(pc) - 1;
/* ldr x1, [x(stolen), #(offs)] */
*(uint *)vmcode_get_writable_addr((byte *)pc) =
(0xf9400000 | 1 | (dr_reg_stolen - DR_REG_X0) << 5 |
get_ibl_entry_tls_offs(dcontext, exit_target) >> 3 << 10);
if (hot_patch)
machine_cache_sync(pc, pc + 1, true);
}
cache_pc
indirect_linkstub_stub_pc(dcontext_t *dcontext, fragment_t *f, linkstub_t *l)
{
cache_pc cti = EXIT_CTI_PC(f, l);
if (!EXIT_HAS_STUB(l->flags, f->flags))
return NULL;
ASSERT(decode_raw_is_jmp(dcontext, cti));
return decode_raw_jmp_target(dcontext, cti);
}
cache_pc
cbr_fallthrough_exit_cti(cache_pc prev_cti_pc)
{
ASSERT_NOT_IMPLEMENTED(false); /* FIXME i#1569 */
return NULL;
}
void
unlink_indirect_exit(dcontext_t *dcontext, fragment_t *f, linkstub_t *l)
{
byte *stub_pc = (byte *)EXIT_STUB_PC(dcontext, f, l);
uint *pc;
cache_pc exit_target;
ibl_code_t *ibl_code = NULL;
ASSERT(linkstub_owned_by_fragment(dcontext, f, l));
ASSERT(LINKSTUB_INDIRECT(l->flags));
/* Target is always the same, so if it's already unlinked, this is a nop. */
if (!TEST(LINK_LINKED, l->flags))
return;
ibl_code = get_ibl_routine_code(dcontext, extract_branchtype(l->flags), f->flags);
exit_target = ibl_code->unlinked_ibl_entry;
/* Set pc to the last instruction in the stub. */
pc = (uint *)(stub_pc +
exit_stub_size(dcontext, ibl_code->indirect_branch_lookup_routine,
f->flags) -
AARCH64_INSTR_SIZE);
pc = get_stub_branch(pc) - 1;
/* ldr x1, [x(stolen), #(offs)] */
*(uint *)vmcode_get_writable_addr((byte *)pc) =
(0xf9400000 | 1 | (dr_reg_stolen - DR_REG_X0) << 5 |
get_ibl_entry_tls_offs(dcontext, exit_target) >> 3 << 10);
machine_cache_sync(pc, pc + 1, true);
}
/*******************************************************************************
* COARSE-GRAIN FRAGMENT SUPPORT
*/
cache_pc
entrance_stub_jmp(cache_pc stub)
{
ASSERT_NOT_IMPLEMENTED(false); /* FIXME i#1569 */
return NULL;
}
bool
coarse_is_entrance_stub(cache_pc stub)
{
/* FIXME i#1575: coarse-grain NYI on AArch64 */
return false;
}
/*###########################################################################
*
* fragment_t Prefixes
*/
int
fragment_ibt_prefix_size(uint flags)
{
/* Nothing extra for ibt as we don't have flags to restore */
return FRAGMENT_BASE_PREFIX_SIZE(flags);
}
void
insert_fragment_prefix(dcontext_t *dcontext, fragment_t *f)
{
/* Always use prefix on AArch64 as there is no load to PC. */
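/* The prefix is a single instruction that reloads the app's x0 and x1 from
 * their TLS slots; the IBL hit path and far-fragment links both enter here
 * with those slots still holding the app values.
 */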
byte *write_start = vmcode_get_writable_addr(f->start_pc);
byte *pc = write_start;
ASSERT(f->prefix_size == 0);
/* ldp x0, x1, [x(stolen), #(off)] */
*(uint *)pc = (0xa9400000 | (DR_REG_X0 - DR_REG_X0) | (DR_REG_X1 - DR_REG_X0) << 10 |
(dr_reg_stolen - DR_REG_X0) << 5 | TLS_REG0_SLOT >> 3 << 15);
pc += 4;
f->prefix_size = (byte)(((cache_pc)pc) - write_start);
ASSERT(f->prefix_size == fragment_prefix_size(f->flags));
}
/***************************************************************************/
/* THREAD-PRIVATE/SHARED ROUTINE GENERATION */
/***************************************************************************/
void
append_call_exit_dr_hook(dcontext_t *dcontext, instrlist_t *ilist, bool absolute,
bool shared)
{
/* i#1569: DR_HOOK is not supported on AArch64 */
ASSERT_NOT_IMPLEMENTED(EXIT_DR_HOOK == NULL);
}
void
append_restore_xflags(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
{
APP(ilist, RESTORE_FROM_DC(dcontext, DR_REG_W0, XFLAGS_OFFSET));
APP(ilist, RESTORE_FROM_DC(dcontext, DR_REG_W1, XFLAGS_OFFSET + 4));
APP(ilist, RESTORE_FROM_DC(dcontext, DR_REG_W2, XFLAGS_OFFSET + 8));
APP(ilist,
INSTR_CREATE_msr(dcontext, opnd_create_reg(DR_REG_NZCV),
opnd_create_reg(DR_REG_X0)));
APP(ilist,
INSTR_CREATE_msr(dcontext, opnd_create_reg(DR_REG_FPCR),
opnd_create_reg(DR_REG_X1)));
APP(ilist,
INSTR_CREATE_msr(dcontext, opnd_create_reg(DR_REG_FPSR),
opnd_create_reg(DR_REG_X2)));
}
/* dcontext is in REG_DCXT; other registers can be used as scratch.
*/
void
append_restore_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
{
int i;
/* add x1, x(dcxt), #(off) */
APP(ilist,
XINST_CREATE_add_2src(dcontext, opnd_create_reg(DR_REG_X1),
opnd_create_reg(REG_DCXT),
OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, simd))));
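/* Restore q0..q31 in pairs: ldp q0, q1, [x1]; ldp q2, q3, [x1, #32]; ...,
 * 16 ldp instructions in all.
 */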
for (i = 0; i < 32; i += 2) {
/* ldp q(i), q(i + 1), [x1, #(i * 16)] */
APP(ilist,
INSTR_CREATE_ldp(
dcontext, opnd_create_reg(DR_REG_Q0 + i),
opnd_create_reg(DR_REG_Q0 + i + 1),
opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0, i * 16, OPSZ_32)));
}
}
/* Append instructions to restore gpr on fcache enter, to be executed
* right before jump to fcache target.
* - dcontext is in REG_DCXT
* - DR's tls base is in dr_reg_stolen
* - all other registers can be used as scratch, and we are using X0.
*/
void
append_restore_gpr(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
{
int i;
/* FIXME i#1573: NYI on ARM with SELFPROT_DCONTEXT */
ASSERT_NOT_IMPLEMENTED(!TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask));
ASSERT(dr_reg_stolen != SCRATCH_REG0);
/* Store stolen reg value into TLS slot. */
APP(ilist, RESTORE_FROM_DC(dcontext, SCRATCH_REG0, REG_OFFSET(dr_reg_stolen)));
APP(ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG0, TLS_REG_STOLEN_SLOT));
/* Save DR's tls base into mcontext so we can blindly include it in the
* loop of OP_ldp instructions below.
* This means that the mcontext stolen reg slot holds DR's base instead of
* the app's value while we're in the cache, which can be confusing: but we have
* to get the official value from TLS on signal and other transitions anyway,
* and DR's base makes it easier to spot bugs than a prior app value.
*/
APP(ilist, SAVE_TO_DC(dcontext, dr_reg_stolen, REG_OFFSET(dr_reg_stolen)));
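/* Pick a temporary register for restoring SP: normally x0, but x1 if the
 * dcontext base itself lives in x0.
 */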
i = (REG_DCXT == DR_REG_X0);
/* ldp x30, x(i), [x(dcxt), #x30_offset] */
APP(ilist,
INSTR_CREATE_ldp(dcontext, opnd_create_reg(DR_REG_X30),
opnd_create_reg(DR_REG_X0 + i),
opnd_create_base_disp(REG_DCXT, DR_REG_NULL, 0,
REG_OFFSET(DR_REG_X30), OPSZ_16)));
/* mov sp, x(i) */
APP(ilist,
XINST_CREATE_move(dcontext, opnd_create_reg(DR_REG_SP),
opnd_create_reg(DR_REG_X0 + i)));
for (i = 0; i < 30; i += 2) {
if ((REG_DCXT - DR_REG_X0) >> 1 != i >> 1) {
/* ldp x(i), x(i+1), [x(dcxt), #xi_offset] */
APP(ilist,
INSTR_CREATE_ldp(dcontext, opnd_create_reg(DR_REG_X0 + i),
opnd_create_reg(DR_REG_X0 + i + 1),
opnd_create_base_disp(REG_DCXT, DR_REG_NULL, 0,
REG_OFFSET(DR_REG_X0 + i),
OPSZ_16)));
}
}
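/* Lastly restore the pair containing REG_DCXT itself, so the dcontext base
 * is clobbered only after all other registers have been restored.
 */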
i = (REG_DCXT - DR_REG_X0) & ~1;
/* ldp x(i), x(i+1), [x(dcxt), #xi_offset] */
APP(ilist,
INSTR_CREATE_ldp(dcontext, opnd_create_reg(DR_REG_X0 + i),
opnd_create_reg(DR_REG_X0 + i + 1),
opnd_create_base_disp(REG_DCXT, DR_REG_NULL, 0,
REG_OFFSET(DR_REG_X0 + i), OPSZ_16)));
}
/* Append instructions to save gpr on fcache return, called after
* append_fcache_return_prologue.
* Assuming the execution comes from an exit stub via br DR_REG_X1,
* dcontext base is held in REG_DCXT, and exit stub in X0.
* The app's x0 and x1 are stored in TLS_REG0_SLOT and TLS_REG1_SLOT.
* - store all registers into dcontext's mcontext
* - restore REG_DCXT app value from TLS slot to mcontext
* - restore dr_reg_stolen app value from TLS slot to mcontext
*/
void
append_save_gpr(dcontext_t *dcontext, instrlist_t *ilist, bool ibl_end, bool absolute,
generated_code_t *code, linkstub_t *linkstub, bool coarse_info)
{
int i;
/* X0 and X1 will always have been saved in TLS slots before executing
* the code generated here. See, for example:
* emit_do_syscall_common, emit_indirect_branch_lookup, handle_sigreturn,
* insert_exit_stub_other_flags, execute_handler_from_{cache,dispatch},
* transfer_from_sig_handler_to_fcache_return
*/
for (i = 2; i < 30; i += 2) {
/* stp x(i), x(i+1), [x(dcxt), #xi_offset] */
APP(ilist,
INSTR_CREATE_stp(dcontext,
opnd_create_base_disp(REG_DCXT, DR_REG_NULL, 0,
REG_OFFSET(DR_REG_X0 + i), OPSZ_16),
opnd_create_reg(DR_REG_X0 + i),
opnd_create_reg(DR_REG_X0 + i + 1)));
}
/* mov x1, sp */
APP(ilist,
XINST_CREATE_move(dcontext, opnd_create_reg(DR_REG_X1),
opnd_create_reg(DR_REG_SP)));
/* stp x30, x1, [x(dcxt), #x30_offset] */
APP(ilist,
INSTR_CREATE_stp(dcontext,
opnd_create_base_disp(REG_DCXT, DR_REG_NULL, 0,
REG_OFFSET(DR_REG_X30), OPSZ_16),
opnd_create_reg(DR_REG_X30), opnd_create_reg(DR_REG_X1)));
/* ldp x1, x2, [x(stolen)]
* stp x1, x2, [x(dcxt)]
*/
APP(ilist,
INSTR_CREATE_ldp(
dcontext, opnd_create_reg(DR_REG_X1), opnd_create_reg(DR_REG_X2),
opnd_create_base_disp(dr_reg_stolen, DR_REG_NULL, 0, 0, OPSZ_16)));
APP(ilist,
INSTR_CREATE_stp(dcontext,
opnd_create_base_disp(REG_DCXT, DR_REG_NULL, 0, 0, OPSZ_16),
opnd_create_reg(DR_REG_X1), opnd_create_reg(DR_REG_X2)));
if (linkstub != NULL) {
/* FIXME i#1575: NYI for coarse-grain stub */
ASSERT_NOT_IMPLEMENTED(false);
}
/* REG_DCXT's app value is stored in DCONTEXT_BASE_SPILL_SLOT by
* append_prepare_fcache_return, so copy it to mcontext.
*/
APP(ilist, RESTORE_FROM_TLS(dcontext, SCRATCH_REG1, DCONTEXT_BASE_SPILL_SLOT));
APP(ilist, SAVE_TO_DC(dcontext, SCRATCH_REG1, REG_DCXT_OFFS));
/* dr_reg_stolen's app value is always stored in the TLS spill slot,
* and we restore its value back to mcontext on fcache return.
*/
APP(ilist, RESTORE_FROM_TLS(dcontext, SCRATCH_REG1, TLS_REG_STOLEN_SLOT));
APP(ilist, SAVE_TO_DC(dcontext, SCRATCH_REG1, REG_OFFSET(dr_reg_stolen)));
}
/* dcontext base is held in REG_DCXT, and exit stub in X0.
* GPR's are already saved.
*/
void
append_save_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
{
int i;
/* add x1, x(DCXT), #(off) */
APP(ilist,
XINST_CREATE_add_2src(dcontext, opnd_create_reg(DR_REG_X1),
opnd_create_reg(REG_DCXT),
OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, simd))));
for (i = 0; i < 32; i += 2) {
/* stp q(i), q(i + 1), [x1, #(i * 16)] */
APP(ilist,
INSTR_CREATE_stp(
dcontext,
opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0, i * 16, OPSZ_32),
opnd_create_reg(DR_REG_Q0 + i), opnd_create_reg(DR_REG_Q0 + i + 1)));
}
}
/* Scratch reg0 is holding the exit stub. */
void
append_save_clear_xflags(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
{
APP(ilist,
INSTR_CREATE_mrs(dcontext, opnd_create_reg(DR_REG_X1),
opnd_create_reg(DR_REG_NZCV)));
APP(ilist,
INSTR_CREATE_mrs(dcontext, opnd_create_reg(DR_REG_X2),
opnd_create_reg(DR_REG_FPCR)));
APP(ilist,
INSTR_CREATE_mrs(dcontext, opnd_create_reg(DR_REG_X3),
opnd_create_reg(DR_REG_FPSR)));
APP(ilist, SAVE_TO_DC(dcontext, DR_REG_W1, XFLAGS_OFFSET));
APP(ilist, SAVE_TO_DC(dcontext, DR_REG_W2, XFLAGS_OFFSET + 4));
APP(ilist, SAVE_TO_DC(dcontext, DR_REG_W3, XFLAGS_OFFSET + 8));
}
bool
append_call_enter_dr_hook(dcontext_t *dcontext, instrlist_t *ilist, bool ibl_end,
bool absolute)
{
/* i#1569: DR_HOOK is not supported on AArch64 */
ASSERT_NOT_IMPLEMENTED(EXIT_DR_HOOK == NULL);
return false;
}
void
insert_save_eflags(dcontext_t *dcontext, instrlist_t *ilist, instr_t *where, uint flags,
bool tls, bool absolute _IF_X86_64(bool x86_to_x64_ibl_opt))
{
ASSERT_NOT_IMPLEMENTED(false); /* FIXME i#1569 */
}
void
insert_restore_eflags(dcontext_t *dcontext, instrlist_t *ilist, instr_t *where,
uint flags, bool tls,
bool absolute _IF_X86_64(bool x86_to_x64_ibl_opt))
{
ASSERT_NOT_IMPLEMENTED(false); /* FIXME i#1569 */
}
byte *
emit_inline_ibl_stub(dcontext_t *dcontext, byte *pc, ibl_code_t *ibl_code,
bool target_trace_table)
{
ASSERT_NOT_IMPLEMENTED(false); /* FIXME i#1569 */
return pc;
}
bool
instr_is_ibl_hit_jump(instr_t *instr)
{
return instr_get_opcode(instr) == OP_br &&
opnd_get_reg(instr_get_target(instr)) == DR_REG_X0;
}
byte *
emit_indirect_branch_lookup(dcontext_t *dc, generated_code_t *code, byte *pc,
byte *fcache_return_pc, bool target_trace_table,
bool inline_ibl_head, ibl_code_t *ibl_code /* IN/OUT */)
{
bool absolute = false;
instrlist_t ilist;
instrlist_init(&ilist);
patch_list_t *patch = &ibl_code->ibl_patch;
init_patch_list(patch, PATCH_TYPE_INDIRECT_TLS);
instr_t *load_tag = INSTR_CREATE_label(dc);
instr_t *compare_tag = INSTR_CREATE_label(dc);
instr_t *try_next = INSTR_CREATE_label(dc);
instr_t *miss = INSTR_CREATE_label(dc);
instr_t *not_hit = INSTR_CREATE_label(dc);
instr_t *target_delete_entry = INSTR_CREATE_label(dc);
instr_t *unlinked = INSTR_CREATE_label(dc);
/* FIXME i#1569: Use INSTR_CREATE macros when encoder is implemented. */
/* On entry we expect:
* x0: link_stub entry
* x1: scratch reg, arrived from br x1
* x2: indirect branch target
* TLS_REG0_SLOT: app's x0
* TLS_REG1_SLOT: app's x1
* TLS_REG2_SLOT: app's x2
* TLS_REG3_SLOT: scratch space
* The following entry points share this context:
* indirect_branch_lookup
* unlink_stub_entry
* target_delete_entry:
* x0: scratch
* x1: table entry pointer from ibl lookup hit path
* x2: app's x2
* TLS_REG0_SLOT: app's x0
* TLS_REG1_SLOT: app's x1
* TLS_REG2_SLOT: app's x2
* On miss exit we output:
* x0: the dcontext->last_exit
* x1: the fcache-return address (we exit via br x1)
* x2: app's x2
* TLS_REG0_SLOT: app's x0 (recovered by fcache_return)
* TLS_REG1_SLOT: app's x1 (recovered by fcache_return)
* On hit exit we output:
* x0: fragment_start_pc (points to the fragment prefix)
* x1: scratch reg
* x2: app's x2
* TLS_REG0_SLOT: app's x0 (recovered by fragment_prefix)
* TLS_REG1_SLOT: app's x1 (recovered by fragment_prefix)
*/
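/* In outline, the lookup below performs (a sketch, not the literal emitted
 * code; entries are 16-byte fragment_entry_t structs):
 *   fragment_entry_t *e = &table[target & mask];
 *   while (e->tag_fragment != 0) {        // 0 means empty slot: a miss
 *       if (e->tag_fragment == target)
 *           break;                        // hit: br to e->start_pc_fragment
 *       e++;                              // sentinel at the end bounds probe
 *   }
 */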
/* Spill x0. */
APP(&ilist, instr_create_save_to_tls(dc, DR_REG_R0, TLS_REG3_SLOT));
/* Load-acquire hash mask. We need a load-acquire to ensure we see updates
* properly; the corresponding store-release is in update_lookuptable_tls().
*/
/* add x1, x28 + hash_mask_offs; ldar x1, [x1] (ldar doesn't take an offs.) */
APP(&ilist,
INSTR_CREATE_add(dc, opnd_create_reg(DR_REG_X1), opnd_create_reg(dr_reg_stolen),
OPND_CREATE_INT32(TLS_MASK_SLOT(ibl_code->branch_type))));
APP(&ilist,
INSTR_CREATE_ldar(dc, opnd_create_reg(DR_REG_X1),
OPND_CREATE_MEMPTR(DR_REG_X1, 0)));
/* ldr x0, [x28, hash_table] */
APP(&ilist,
INSTR_CREATE_ldr(dc, opnd_create_reg(DR_REG_X0),
opnd_create_base_disp(dr_reg_stolen, DR_REG_NULL, 0,
TLS_TABLE_SLOT(ibl_code->branch_type),
OPSZ_8)));
/* and x1, x1, x2 */
APP(&ilist,
INSTR_CREATE_and(dc, opnd_create_reg(DR_REG_X1), opnd_create_reg(DR_REG_X1),
opnd_create_reg(DR_REG_X2)));
/* Get table entry. */
/* add x1, x0, x1, LSL #(4 - HASHTABLE_IBL_OFFSET) */
APP(&ilist,
INSTR_CREATE_add_shift(
dc, opnd_create_reg(DR_REG_X1), opnd_create_reg(DR_REG_X0),
opnd_create_reg(DR_REG_X1), OPND_CREATE_INT8(DR_SHIFT_LSL),
OPND_CREATE_INT8(4 - HASHTABLE_IBL_OFFSET(ibl_code->branch_type))));
/* x1 now holds the fragment_entry_t* in the hashtable. */
APP(&ilist, load_tag);
/* Load tag from fragment_entry_t* in the hashtable to x0. */
/* ldr x0, [x1, #tag_fragment_offset] */
APP(&ilist,
INSTR_CREATE_ldr(
dc, opnd_create_reg(DR_REG_X0),
OPND_CREATE_MEMPTR(DR_REG_X1, offsetof(fragment_entry_t, tag_fragment))));
/* Did we hit? */
APP(&ilist, compare_tag);
/* cbz x0, not_hit */
APP(&ilist,
INSTR_CREATE_cbz(dc, opnd_create_instr(not_hit), opnd_create_reg(DR_REG_X0)));
/* sub x0, x0, x2 */
APP(&ilist,
XINST_CREATE_sub(dc, opnd_create_reg(DR_REG_X0), opnd_create_reg(DR_REG_X2)));
/* cbnz x0, try_next */
APP(&ilist,
INSTR_CREATE_cbnz(dc, opnd_create_instr(try_next), opnd_create_reg(DR_REG_X0)));
/* Hit path. */
/* App's original values of x0 and x1 are already in respective TLS slots, and
* will be restored by the fragment prefix.
*/
/* Recover app's original x2. */
APP(&ilist, instr_create_restore_from_tls(dc, DR_REG_R2, TLS_REG2_SLOT));
/* ldr x0, [x1, #start_pc_fragment_offset] */
APP(&ilist,
INSTR_CREATE_ldr(dc, opnd_create_reg(DR_REG_X0),
OPND_CREATE_MEMPTR(
DR_REG_X1, offsetof(fragment_entry_t, start_pc_fragment))));
/* br x0
* (keep in sync with instr_is_ibl_hit_jump())
*/
APP(&ilist, INSTR_CREATE_br(dc, opnd_create_reg(DR_REG_X0)));
APP(&ilist, try_next);
/* Try next entry, in case of collision. No wraparound check is needed
* because of the sentinel at the end.
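* The pre-index writeback form below has two destinations (x0 and the
* updated x1) and three sources (the memory operand, x1 and the increment),
* hence instr_create_2dst_3src.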
* ldr x0, [x1, #tag_fragment_offset]! */
APP(&ilist,
instr_create_2dst_3src(
dc, OP_ldr, opnd_create_reg(DR_REG_X0), opnd_create_reg(DR_REG_X1),
OPND_CREATE_MEMPTR(DR_REG_X1, sizeof(fragment_entry_t)),
opnd_create_reg(DR_REG_X1), OPND_CREATE_INTPTR(sizeof(fragment_entry_t))));
/* b compare_tag */
APP(&ilist, INSTR_CREATE_b(dc, opnd_create_instr(compare_tag)));
APP(&ilist, not_hit);
if (INTERNAL_OPTION(ibl_sentinel_check)) {
/* Load start_pc from fragment_entry_t* in the hashtable to x0. */
/* ldr x0, [x1, #start_pc_fragment] */
APP(&ilist,
XINST_CREATE_load(
dc, opnd_create_reg(DR_REG_X0),
OPND_CREATE_MEMPTR(DR_REG_X1,
offsetof(fragment_entry_t, start_pc_fragment))));
/* To compare with an arbitrary constant we'd need a 4th scratch reg.
* Instead we rely on the sentinel start PC being 1.
*/
ASSERT(HASHLOOKUP_SENTINEL_START_PC == (cache_pc)PTR_UINT_1);
/* sub x0, x0, #1 */
APP(&ilist,
XINST_CREATE_sub(dc, opnd_create_reg(DR_REG_X0), OPND_CREATE_INT8(1)));
/* cbnz x0, miss */
APP(&ilist,
INSTR_CREATE_cbnz(dc, opnd_create_instr(miss), opnd_create_reg(DR_REG_R0)));
/* Point at the first table slot and then go load and compare its tag */
/* ldr x1, [x28, #table_base] */
APP(&ilist,
XINST_CREATE_load(dc, opnd_create_reg(DR_REG_X1),
OPND_CREATE_MEMPTR(dr_reg_stolen,
TLS_TABLE_SLOT(ibl_code->branch_type))));
/* branch to load_tag */
APP(&ilist, INSTR_CREATE_b(dc, opnd_create_instr(load_tag)));
}
/* Target delete entry */
APP(&ilist, target_delete_entry);
add_patch_marker(patch, target_delete_entry, PATCH_ASSEMBLE_ABSOLUTE,
0 /* beginning of instruction */,
(ptr_uint_t *)&ibl_code->target_delete_entry);
/* Load next_tag from table entry. */
APP(&ilist,
INSTR_CREATE_ldr(
dc, opnd_create_reg(DR_REG_R2),
OPND_CREATE_MEMPTR(DR_REG_R1, offsetof(fragment_entry_t, tag_fragment))));
/* Store &linkstub_ibl_deleted in r0, instead of the last-exit linkstub that
* the skipped code below would store.
*/
instrlist_insert_mov_immed_ptrsz(dc, (ptr_uint_t)get_ibl_deleted_linkstub(),
opnd_create_reg(DR_REG_R0), &ilist, NULL, NULL,
NULL);
APP(&ilist, INSTR_CREATE_b(dc, opnd_create_instr(unlinked)));
APP(&ilist, miss);
/* Recover the dcontext->last_exit to x0 */
APP(&ilist, instr_create_restore_from_tls(dc, DR_REG_R0, TLS_REG3_SLOT));
/* Unlink path: entry from stub */
APP(&ilist, unlinked);
add_patch_marker(patch, unlinked, PATCH_ASSEMBLE_ABSOLUTE,
0 /* beginning of instruction */,
(ptr_uint_t *)&ibl_code->unlinked_ibl_entry);
/* Put ib tgt into dcontext->next_tag */
insert_shared_get_dcontext(dc, &ilist, NULL, true);
APP(&ilist, SAVE_TO_DC(dc, DR_REG_R2, NEXT_TAG_OFFSET));
APP(&ilist, instr_create_restore_from_tls(dc, DR_REG_R5, DCONTEXT_BASE_SPILL_SLOT));
APP(&ilist, instr_create_restore_from_tls(dc, DR_REG_R2, TLS_REG2_SLOT));
/* ldr x1, [x(stolen), #(offs)] */
APP(&ilist,
INSTR_CREATE_ldr(dc, opnd_create_reg(DR_REG_X1),
OPND_TLS_FIELD(TLS_FCACHE_RETURN_SLOT)));
/* br x1 */
APP(&ilist, INSTR_CREATE_br(dc, opnd_create_reg(DR_REG_X1)));
ibl_code->ibl_routine_length = encode_with_patch_list(dc, patch, &ilist, pc);
instrlist_clear(dc, &ilist);
return pc + ibl_code->ibl_routine_length;
}
void
relink_special_ibl_xfer(dcontext_t *dcontext, int index,
ibl_entry_point_type_t entry_type, ibl_branch_type_t ibl_type)
{
generated_code_t *code;
byte *ibl_tgt;
uint *pc;
if (dcontext == GLOBAL_DCONTEXT) {
ASSERT(!special_ibl_xfer_is_thread_private()); /* else shouldn't be called */
code = SHARED_GENCODE_MATCH_THREAD(get_thread_private_dcontext());
} else {
ASSERT(special_ibl_xfer_is_thread_private()); /* else shouldn't be called */
code = THREAD_GENCODE(dcontext);
}
if (code == NULL) /* thread private that we don't need */
return;
ibl_tgt = special_ibl_xfer_tgt(dcontext, code, entry_type, ibl_type);
ASSERT(code->special_ibl_xfer[index] != NULL);
pc = (uint *)(code->special_ibl_xfer[index] + code->special_ibl_unlink_offs[index]);
uint *write_pc = (uint *)vmcode_get_writable_addr((byte *)pc);
protect_generated_code(code, WRITABLE);
/* ldr x1, [x(stolen), #(offs)] */
write_pc[0] = (0xf9400000 | 1 | (dr_reg_stolen - DR_REG_X0) << 5 |
get_ibl_entry_tls_offs(dcontext, ibl_tgt) >> 3 << 10);
/* br x1 */
write_pc[1] = BR_X1_INST;
machine_cache_sync(pc, pc + 2, true);
protect_generated_code(code, READONLY);
}
/* addr must be a writable (vmcode) address. */
bool
fill_with_nops(dr_isa_mode_t isa_mode, byte *addr, size_t size)
{
byte *pc;
if (!ALIGNED(addr, 4) || !ALIGNED(addr + size, 4)) {
ASSERT_NOT_REACHED();
return false;
}
for (pc = addr; pc < addr + size; pc += 4)
*(uint *)pc = NOP_INST; /* nop */
return true;
}