/* *******************************************************************************
* Copyright (c) 2019-2020 Google, Inc. All rights reserved.
* *******************************************************************************/
/*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of Google, Inc. nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
/**************************************************************************************
* Restartable sequence ("rseq") support (i#2350).
* This is a kernel feature which provides cpu-atomic regions: if a thread
* is pre-empted within an rseq region, an abort handler is invoked.
* The feature is difficult to handle under binary instrumentation.
* We rely on the app following certain conventions, including containing a
* section holding a table of all rseq sequences.
*/
#include "../globals.h"
#include "../module_shared.h"
#include "module_private.h"
#include "os_private.h"
#include "rseq_linux.h"
#include "../fragment.h"
#include "decode.h"
#ifdef CLIENT_INTERFACE
# include "instrument.h"
#endif
#include <stddef.h>
#ifdef HAVE_RSEQ
# include <linux/rseq.h>
#else
struct rseq_cs {
uint version;
uint flags;
uint64 start_ip;
uint64 post_commit_offset;
uint64 abort_ip;
} __attribute__((aligned(4 * sizeof(uint64))));
struct rseq {
uint cpu_id_start;
uint cpu_id;
union {
uint64 ptr64;
} rseq_cs;
uint flags;
} __attribute__((aligned(4 * sizeof(uint64))));
# define RSEQ_FLAG_UNREGISTER 1
#endif
#include "include/syscall.h"
#include <errno.h>
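/* Global vector of rseq regions registered by the app, covering [start, end) of
* each sequence and holding a per-region rseq_region_t payload.
*/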
vm_area_vector_t *d_r_rseq_areas;
DECLARE_CXTSWPROT_VAR(static mutex_t rseq_trigger_lock,
INIT_LOCK_FREE(rseq_trigger_lock));
static volatile bool rseq_enabled;
/* We require all threads to use the same TLS offset to point at struct rseq. */
static int rseq_tls_offset;
/* The signature is registered per thread, but we require all registrations
* to be the same.
*/
static int rseq_signature;
typedef struct _rseq_region_t {
app_pc start;
app_pc end;
app_pc handler;
/* We need to preserve input registers for targeting "start" instead of "handler"
* for our 2nd invocation, if they're written in the rseq region. We only support
* GPR inputs. We document that we do not support any other inputs (no flags, no
* SIMD registers).
*/
bool reg_written[DR_NUM_GPR_REGS];
} rseq_region_t;
/* We need to store a struct rseq_cs per fragment_t. To avoid the cost of adding a
* pointer field to every fragment_t, and the complexity of another subclass like
* trace_t, we store them externally in a hashtable. The FRAG_HAS_RSEQ_ENDPOINT flag
* avoids the hashtable lookup on every fragment.
*/
static generic_table_t *rseq_cs_table;
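/* Initial size parameter for rseq_cs_table, which generally holds few entries since
* only fragments ending an rseq region need one.
*/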
#define INIT_RSEQ_CS_TABLE_SIZE 5
/* vmvector callbacks */
static void
rseq_area_free(void *data)
{
HEAP_TYPE_FREE(GLOBAL_DCONTEXT, data, rseq_region_t, ACCT_VMAREAS, PROTECTED);
}
static void *
rseq_area_dup(void *data)
{
rseq_region_t *src = (rseq_region_t *)data;
rseq_region_t *dst =
HEAP_TYPE_ALLOC(GLOBAL_DCONTEXT, rseq_region_t, ACCT_VMAREAS, PROTECTED);
ASSERT(src != NULL);
*dst = *src;
return dst;
}
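/* Size of each rseq_cs allocation: room for one struct rseq_cs plus slack so the
* caller can align the start to __alignof(struct rseq_cs) (32 bytes).
*/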
static inline size_t
rseq_cs_alloc_size(void)
{
return sizeof(struct rseq_cs) + __alignof(struct rseq_cs);
}
static void
rseq_cs_free(dcontext_t *dcontext, void *data)
{
global_heap_free(data, rseq_cs_alloc_size() HEAPACCT(ACCT_OTHER));
}
void
d_r_rseq_init(void)
{
VMVECTOR_ALLOC_VECTOR(d_r_rseq_areas, GLOBAL_DCONTEXT,
VECTOR_SHARED | VECTOR_NEVER_MERGE, rseq_areas);
vmvector_set_callbacks(d_r_rseq_areas, rseq_area_free, rseq_area_dup, NULL, NULL);
rseq_cs_table = generic_hash_create(GLOBAL_DCONTEXT, INIT_RSEQ_CS_TABLE_SIZE, 80,
HASHTABLE_SHARED | HASHTABLE_PERSISTENT,
rseq_cs_free _IF_DEBUG("rseq_cs table"));
/* Enable rseq pre-attach for things like dr_prepopulate_cache(). */
if (rseq_is_registered_for_current_thread())
rseq_locate_rseq_regions();
}
void
d_r_rseq_exit(void)
{
generic_hash_destroy(GLOBAL_DCONTEXT, rseq_cs_table);
vmvector_delete_vector(GLOBAL_DCONTEXT, d_r_rseq_areas);
DELETE_LOCK(rseq_trigger_lock);
}
void
rseq_thread_attach(dcontext_t *dcontext)
{
rseq_region_t *info;
if (!vmvector_lookup_data(d_r_rseq_areas, dcontext->next_tag, NULL, NULL,
(void **)&info))
return;
/* The thread missed the save of its state on rseq entry. We could try to save here
* so the restore on rseq exit won't read incorrect values, but it's simpler and
* less error-prone to send it to the abort handler, like we do on detach or other
* translation points.
*/
dcontext->next_tag = info->handler;
}
bool
rseq_get_region_info(app_pc pc, app_pc *start OUT, app_pc *end OUT, app_pc *handler OUT,
bool **reg_written OUT, int *reg_written_size OUT)
{
rseq_region_t *info;
if (!vmvector_lookup_data(d_r_rseq_areas, pc, start, end, (void **)&info))
return false;
if (handler != NULL)
*handler = info->handler;
if (reg_written != NULL)
*reg_written = info->reg_written;
if (reg_written_size != NULL)
*reg_written_size = sizeof(info->reg_written) / sizeof(info->reg_written[0]);
return true;
}
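/* Returns the segment offset of the rseq_cs pointer field inside the app's
* registered struct rseq (i.e., the struct's TLS offset plus the field offset).
*/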
int
rseq_get_tls_ptr_offset(void)
{
/* This read is assumed to be atomic. */
ASSERT(rseq_tls_offset != 0);
return rseq_tls_offset + offsetof(struct rseq, rseq_cs);
}
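/* If the app's rseq_cs field currently points at one of our rseq_cs structures,
* clear it so the kernel never dereferences a descriptor we are about to free.
*/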
static void
rseq_clear_tls_ptr(dcontext_t *dcontext)
{
ASSERT(rseq_tls_offset != 0);
byte *base = get_segment_base(LIB_SEG_TLS);
struct rseq *app_rseq = (struct rseq *)(base + rseq_tls_offset);
/* We're directly writing this in the cache, so we do not bother with safe_read
* or safe_write here either. We already cannot handle rseq adversarial cases.
*/
if (is_dynamo_address((byte *)(ptr_uint_t)app_rseq->rseq_cs.ptr64))
app_rseq->rseq_cs.ptr64 = 0;
}
int
rseq_get_signature(void)
{
/* This is only called after rseq is initialized and the signature determined. */
ASSERT(rseq_enabled);
return rseq_signature;
}
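/* Allocates space for a struct rseq_cs. Returns the raw allocation (which is what
* must eventually be freed) and stores the aligned address the caller should use
* in *rseq_cs_aligned.
*/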
byte *
rseq_get_rseq_cs_alloc(byte **rseq_cs_aligned OUT)
{
byte *rseq_cs_alloc = global_heap_alloc(rseq_cs_alloc_size() HEAPACCT(ACCT_OTHER));
*rseq_cs_aligned = (byte *)ALIGN_FORWARD(rseq_cs_alloc, __alignof(struct rseq_cs));
return rseq_cs_alloc;
}
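/* Fills in the rseq_cs descriptor for fragment f's copy of an rseq sequence in the
* code cache and records the allocation in rseq_cs_table, keyed by f, so that it is
* freed when f is deleted.
*/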
void
rseq_record_rseq_cs(byte *rseq_cs_alloc, fragment_t *f, cache_pc start, cache_pc end,
cache_pc abort)
{
struct rseq_cs *target =
(struct rseq_cs *)ALIGN_FORWARD(rseq_cs_alloc, __alignof(struct rseq_cs));
target->version = 0;
target->flags = 0;
target->start_ip = (ptr_uint_t)start;
target->post_commit_offset = (ptr_uint_t)(end - start);
target->abort_ip = (ptr_uint_t)abort;
TABLE_RWLOCK(rseq_cs_table, write, lock);
generic_hash_add(GLOBAL_DCONTEXT, rseq_cs_table, (ptr_uint_t)f, rseq_cs_alloc);
TABLE_RWLOCK(rseq_cs_table, write, unlock);
}
void
rseq_remove_fragment(dcontext_t *dcontext, fragment_t *f)
{
if (!rseq_enabled)
return;
/* Avoid freeing a live rseq_cs for a thread-private fragment deletion. */
rseq_clear_tls_ptr(dcontext);
TABLE_RWLOCK(rseq_cs_table, write, lock);
generic_hash_remove(GLOBAL_DCONTEXT, rseq_cs_table, (ptr_uint_t)f);
TABLE_RWLOCK(rseq_cs_table, write, unlock);
}
void
rseq_shared_fragment_flushtime_update(dcontext_t *dcontext)
{
if (!rseq_enabled)
return;
/* Avoid freeing a live rseq_cs for thread-shared fragment deletion.
* We clear the pointer on completion of the native rseq execution, but it's
* not easy to clear it on midpoint exits. We instead clear prior to
* rseq_cs being freed: for thread-private in rseq_remove_fragment() and for
* thread-shared each thread should come here prior to deletion.
*/
rseq_clear_tls_ptr(dcontext);
}
#ifdef HAVE_RSEQ
bool
rseq_is_registered_for_current_thread(void)
{
/* Unfortunately there's no way to query the current rseq struct.
* For 64-bit we can pass a kernel address and look for EFAULT
* vs EINVAL, but there is no kernel address for 32-bit.
* So we try to perform a legitimate registration.
*/
struct rseq test_rseq = {};
int res = dynamorio_syscall(SYS_rseq, 4, &test_rseq, sizeof(test_rseq), 0, 0);
if (res == -EINVAL) /* Our struct != registered struct. */
return true;
if (res == -ENOSYS)
return false;
/* If seccomp blocks SYS_rseq we'll get -EPERM. SYS_rseq also returns -EPERM
* if &test_rseq == the app's struct but the signature is different, but that
* seems so unlikely that we just assume -EPERM implies seccomp.
*/
if (res == -EPERM)
return false;
ASSERT(res == 0); /* If not, the struct size or something else changed! */
if (dynamorio_syscall(SYS_rseq, 4, &test_rseq, sizeof(test_rseq),
RSEQ_FLAG_UNREGISTER, 0) != 0) {
ASSERT_NOT_REACHED();
}
return false;
}
#else
bool
rseq_is_registered_for_current_thread(void)
{
return false;
}
#endif
static void
rseq_analyze_instructions(rseq_region_t *info)
{
/* We analyze the instructions inside [start,end) looking for register state that we
* need to preserve for our restart. We do not want to blindly spill and restore
* 16+ registers for every sequence (too much overhead).
*/
instr_t instr;
instr_init(GLOBAL_DCONTEXT, &instr);
app_pc pc = info->start;
int i;
bool reached_cti = false;
memset(info->reg_written, 0, sizeof(info->reg_written));
while (pc < info->end) {
instr_reset(GLOBAL_DCONTEXT, &instr);
app_pc next_pc = decode(GLOBAL_DCONTEXT, pc, &instr);
if (next_pc == NULL) {
REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3,
get_application_name(), get_application_pid(),
"Rseq sequence contains invalid instructions");
ASSERT_NOT_REACHED();
}
if (instr_is_syscall(&instr)
/* Allow a syscall for our test in debug build. */
IF_DEBUG(
&&!check_filter("api.rseq;linux.rseq;linux.rseq_table;linux.rseq_noarray",
get_short_name(get_application_name())))) {
REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3,
get_application_name(), get_application_pid(),
"Rseq sequence contains a system call");
ASSERT_NOT_REACHED();
}
if (instr_is_call(&instr)) {
REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3,
get_application_name(), get_application_pid(),
"Rseq sequence contains a call");
ASSERT_NOT_REACHED();
}
if (instr_is_cti(&instr))
reached_cti = true;
/* We potentially need to preserve any register written anywhere inside
* the sequence. We can't limit ourselves to registers clearly live on
* input, since code *after* the sequence could read them. We do disallow
* callouts to helper functions to simplify our lives.
*
* We only preserve GPRs, for simplicity, and because they are far more likely
* as inputs than flags or SIMD registers. We'd like to verify that only GPRs
* are used, but A) we can't easily check values read *after* the sequence (the
* handler could set up state read afterward and sometimes clobbered inside), B)
* we do want to support SIMD and flags writes in the sequence, and C) even
* checking for values read in the sequence would want new interfaces like
* DR_REG_START_SIMD or register iterators for reasonable code.
*/
for (i = 0; i < DR_NUM_GPR_REGS; i++) {
if (info->reg_written[i])
continue;
reg_id_t reg = DR_REG_START_GPR + (reg_id_t)i;
if (instr_writes_to_reg(&instr, reg, DR_QUERY_DEFAULT)) {
LOG(GLOBAL, LOG_LOADER, 3,
"Rseq region @" PFX " writes register %s at " PFX "\n", info->start,
reg_names[reg], pc);
info->reg_written[i] = true;
}
}
pc = next_pc;
}
instr_free(GLOBAL_DCONTEXT, &instr);
}
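/* Parses one struct rseq_cs entry (adjusted by load_offs when relocations have not
* been applied), validates the abort handler's signature, analyzes the sequence's
* instructions, and registers the region in d_r_rseq_areas.
*/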
static void
rseq_process_entry(struct rseq_cs *entry, ssize_t load_offs)
{
LOG(GLOBAL, LOG_LOADER, 2,
"Found rseq region: ver=%u; flags=%u; start=" PFX "; end=" PFX "; abort=" PFX
"\n",
entry->version, entry->flags, entry->start_ip + load_offs,
entry->start_ip + entry->post_commit_offset + load_offs,
entry->abort_ip + load_offs);
rseq_region_t *info =
HEAP_TYPE_ALLOC(GLOBAL_DCONTEXT, rseq_region_t, ACCT_VMAREAS, PROTECTED);
info->start = (app_pc)(ptr_uint_t)entry->start_ip + load_offs;
info->end = info->start + entry->post_commit_offset;
info->handler = (app_pc)(ptr_uint_t)entry->abort_ip + load_offs;
int signature;
if (!d_r_safe_read(info->handler - sizeof(signature), sizeof(signature),
&signature)) {
REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(),
get_application_pid(),
"Rseq signature is unreadable");
ASSERT_NOT_REACHED();
}
if (signature != rseq_signature) {
if (rseq_signature == 0) {
SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT);
ATOMIC_4BYTE_WRITE(&rseq_signature, signature, false);
SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT);
LOG(GLOBAL, LOG_LOADER, 2, "Rseq signature is 0x%08x\n", rseq_signature);
} else {
REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3,
get_application_name(), get_application_pid(),
"Rseq signatures are not all identical");
ASSERT_NOT_REACHED();
}
}
rseq_analyze_instructions(info);
vmvector_add(d_r_rseq_areas, info->start, info->end, (void *)info);
RSTATS_INC(num_rseq_regions);
/* Check the start pc. We do not go to the effort of checking for non-tag or
* interior pcs.
*/
if (fragment_lookup(GLOBAL_DCONTEXT, info->start) != NULL) {
/* We rely on the app not running rseq code for non-rseq purposes (since we
* can't easily tell the difference; plus we avoid a flush for lazy rseq
* activation).
*/
REPORT_FATAL_ERROR_AND_EXIT(
RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(), get_application_pid(),
"Rseq sequences must not be used for non-rseq purposes");
ASSERT_NOT_REACHED();
}
}
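/* Walks the module's section headers looking for rseq tables: first the
* __rseq_cs_ptr_array section of pointers into __rseq_cs, then, if that is absent,
* the __rseq_cs (or legacy __rseq_table) array itself.
*/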
static void
rseq_process_elf_sections(module_area_t *ma, bool at_map,
ELF_SECTION_HEADER_TYPE *sec_hdr_start, const char *strtab,
ssize_t load_offs)
{
bool found_array = false;
uint i;
ELF_HEADER_TYPE *elf_hdr = (ELF_HEADER_TYPE *)ma->start;
ELF_SECTION_HEADER_TYPE *sec_hdr = sec_hdr_start;
/* The section entries on disk need load_offs. The rseq entries in memory are
* relocated and only need the offset if relocations have not yet been applied.
*/
ssize_t entry_offs = 0;
if (at_map || (DYNAMO_OPTION(early_inject) && !dr_api_entry && !dynamo_started))
entry_offs = load_offs;
for (i = 0; i < elf_hdr->e_shnum; i++) {
#define RSEQ_PTR_ARRAY_SEC_NAME "__rseq_cs_ptr_array"
if (strcmp(strtab + sec_hdr->sh_name, RSEQ_PTR_ARRAY_SEC_NAME) == 0) {
found_array = true;
byte **ptrs = (byte **)(sec_hdr->sh_addr + load_offs);
int j;
for (j = 0; j < sec_hdr->sh_size / sizeof(ptrs); ++j) {
/* We require that the table is loaded. If not, bail, but unlike
* failing to find section headers, make this a fatal error: better
* to notify the user than try to run the rseq w/o proper handling.
*/
if (ptrs < (byte **)ma->start || ptrs > (byte **)ma->end) {
REPORT_FATAL_ERROR_AND_EXIT(
RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(),
get_application_pid(),
RSEQ_PTR_ARRAY_SEC_NAME " is not in a loaded segment");
ASSERT_NOT_REACHED();
}
/* We assume this is a full mapping and it's safe to read the data
* (a partial map shouldn't make it to module list processing).
* We do perform a sanity check to handle unusual non-relocated
* cases (it's possible this array is not in a loaded segment?).
*/
byte *entry = *ptrs + entry_offs;
if (entry < ma->start || entry > ma->end) {
REPORT_FATAL_ERROR_AND_EXIT(
RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(),
get_application_pid(),
RSEQ_PTR_ARRAY_SEC_NAME "'s entries are not in a loaded segment");
ASSERT_NOT_REACHED();
}
rseq_process_entry((struct rseq_cs *)entry, entry_offs);
++ptrs;
}
break;
}
++sec_hdr;
}
if (!found_array) {
sec_hdr = sec_hdr_start;
for (i = 0; i < elf_hdr->e_shnum; i++) {
#define RSEQ_SEC_NAME "__rseq_cs"
#define RSEQ_OLD_SEC_NAME "__rseq_table"
if (strcmp(strtab + sec_hdr->sh_name, RSEQ_SEC_NAME) == 0 ||
strcmp(strtab + sec_hdr->sh_name, RSEQ_OLD_SEC_NAME) == 0) {
/* There may be padding at the start of the section, so ensure we skip
* over it. We're reading the loaded data, not the file, so it will
* always be aligned.
*/
#define RSEQ_CS_ALIGNMENT (4 * sizeof(__u64))
struct rseq_cs *array = (struct rseq_cs *)ALIGN_FORWARD(
sec_hdr->sh_addr + load_offs, RSEQ_CS_ALIGNMENT);
int j;
for (j = 0; j < sec_hdr->sh_size / sizeof(*array); ++j) {
/* We require that the table is loaded. If not, bail. */
if (array < (struct rseq_cs *)ma->start ||
array > (struct rseq_cs *)ma->end) {
REPORT_FATAL_ERROR_AND_EXIT(
RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(),
get_application_pid(),
RSEQ_SEC_NAME " is not in a loaded segment");
ASSERT_NOT_REACHED();
}
rseq_process_entry(array, entry_offs);
++array;
}
break;
}
++sec_hdr;
}
}
}
/* Returns whether the search for rseq data succeeded (not whether any rseq data was found). */
static bool
rseq_process_module(module_area_t *ma, bool at_map)
{
bool res = false;
ASSERT(is_elf_so_header(ma->start, ma->end - ma->start));
ELF_HEADER_TYPE *elf_hdr = (ELF_HEADER_TYPE *)ma->start;
ASSERT(elf_hdr->e_shentsize == sizeof(ELF_SECTION_HEADER_TYPE));
int fd = INVALID_FILE;
byte *sec_map = NULL, *str_map = NULL;
size_t sec_size = 0, str_size = 0;
ELF_SECTION_HEADER_TYPE *sec_hdr = NULL;
char *strtab;
ssize_t load_offs = ma->start - ma->os_data.base_address;
if (at_map && elf_hdr->e_shoff + ma->start < ma->end) {
sec_map = elf_hdr->e_shoff + ma->start;
sec_hdr = (ELF_SECTION_HEADER_TYPE *)sec_map;
/* We assume strtab is there too. */
strtab = (char *)(ma->start + sec_hdr[elf_hdr->e_shstrndx].sh_offset);
if (strtab > (char *)ma->end)
goto rseq_process_module_cleanup;
} else {
/* The section headers are not mapped in. Unfortunately this is the common
* case: they are typically at the end of the file. For this reason, we delay
* calling this function until we see the app use rseq.
*/
if (ma->full_path == NULL)
goto rseq_process_module_cleanup;
fd = os_open(ma->full_path, OS_OPEN_READ);
if (fd == INVALID_FILE)
goto rseq_process_module_cleanup;
off_t offs = ALIGN_BACKWARD(elf_hdr->e_shoff, PAGE_SIZE);
sec_size =
ALIGN_FORWARD(elf_hdr->e_shoff + elf_hdr->e_shnum * elf_hdr->e_shentsize,
PAGE_SIZE) -
offs;
sec_map =
os_map_file(fd, &sec_size, offs, NULL, MEMPROT_READ, MAP_FILE_COPY_ON_WRITE);
if (sec_map == NULL)
goto rseq_process_module_cleanup;
sec_hdr = (ELF_SECTION_HEADER_TYPE *)(sec_map + elf_hdr->e_shoff - offs);
/* We also need the section header string table. */
offs = ALIGN_BACKWARD(sec_hdr[elf_hdr->e_shstrndx].sh_offset, PAGE_SIZE);
str_size = ALIGN_FORWARD(sec_hdr[elf_hdr->e_shstrndx].sh_offset +
sec_hdr[elf_hdr->e_shstrndx].sh_size,
PAGE_SIZE) -
offs;
str_map =
os_map_file(fd, &str_size, offs, NULL, MEMPROT_READ, MAP_FILE_COPY_ON_WRITE);
if (str_map == NULL)
goto rseq_process_module_cleanup;
strtab = (char *)(str_map + sec_hdr[elf_hdr->e_shstrndx].sh_offset - offs);
}
rseq_process_elf_sections(ma, at_map, sec_hdr, strtab, load_offs);
res = true;
rseq_process_module_cleanup:
if (str_size != 0)
os_unmap_file(str_map, str_size);
if (sec_size != 0)
os_unmap_file(sec_map, sec_size);
if (fd != INVALID_FILE)
os_close(fd);
DODEBUG({
if (!res) {
const char *name = GET_MODULE_NAME(&ma->names);
if (name == NULL)
name = "(null)";
LOG(GLOBAL, LOG_INTERP | LOG_VMAREAS, 2,
"%s: error looking for rseq table in %s\n", __FUNCTION__, name);
if (strstr(name, "linux-vdso.so") == NULL) {
SYSLOG_INTERNAL_WARNING_ONCE(
"Failed to identify whether a module has an rseq table");
}
}
});
return res;
}
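/* Returns the (negative) TLS offset of this thread's registered struct rseq relative
* to the app library segment base, or 0 if it could not be located.
*/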
static int
rseq_locate_tls_offset(void)
{
/* We assume (and document) that the loader's static TLS is used, so every thread
* has a consistent %fs:-offs address. Unfortunately, using a local copy of the
* rseq code for our non-instrumented execution requires us to locate the app's
* struct using heuristics, because the system call was poorly designed and will not
* let us replace the app's. Alternatives of no local copy have worse problems.
*/
/* Static TLS is at a negative offset from the app library segment base. We simply
* search all possible aligned slots. Typically there are <64 possible slots.
*/
int offset = 0;
byte *addr = get_app_segment_base(LIB_SEG_TLS);
byte *seg_bottom;
if (addr != NULL && get_memory_info(addr, &seg_bottom, NULL, NULL)) {
LOG(GLOBAL, LOG_LOADER, 3, "rseq within static TLS " PFX " - " PFX "\n",
seg_bottom, addr);
/* struct rseq is aligned to 32. */
int alignment = __alignof(struct rseq);
int i;
for (i = 0; addr - i * alignment >= seg_bottom; i++) {
byte *try_addr = addr - i * alignment;
ASSERT(try_addr >= seg_bottom); /* For loop guarantees this. */
/* Our strategy is to check all of the aligned static TLS addresses to
* find the registered one. Our caller is not supposed to call here
* until the app has registered the current thread.
*/
static const int RSEQ_RARE_SIGNATURE = 42;
int res = dynamorio_syscall(SYS_rseq, 4, try_addr, sizeof(struct rseq),
RSEQ_FLAG_UNREGISTER, RSEQ_RARE_SIGNATURE);
LOG(GLOBAL, LOG_LOADER, 3, "Tried rseq @ " PFX " => %d\n", try_addr, res);
if (res == -EINVAL) /* Our struct != registered struct. */
continue;
/* We expect -EPERM on a signature mismatch. On the small chance the app
* actually used 42 for its signature we'll have to re-register it.
*/
if (res == 0) {
res = dynamorio_syscall(SYS_rseq, 4, try_addr, sizeof(struct rseq), 0,
RSEQ_RARE_SIGNATURE);
ASSERT(res == 0);
/* Re-registered on the app's behalf: fall through to the found case. */
res = -EPERM;
}
if (res == -EPERM) {
/* Found it! */
LOG(GLOBAL, LOG_LOADER, 2,
"Found struct rseq @ " PFX " for thread => %s:-0x%x\n", try_addr,
get_register_name(LIB_SEG_TLS), i * alignment);
offset = -i * alignment;
}
break;
}
}
return offset;
}
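/* Handles the app's SYS_rseq registration (the struct address is in sys_param0):
* records the TLS offset of the app's struct rseq and verifies that every thread
* uses the same offset.
*/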
void
rseq_process_syscall(dcontext_t *dcontext)
{
byte *seg_base = get_app_segment_base(LIB_SEG_TLS);
byte *app_addr = (byte *)dcontext->sys_param0;
bool constant_offset = false;
if (rseq_tls_offset == 0) {
SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT);
int offset = app_addr - seg_base;
/* To handle races here, we use an atomic_exchange. */
int prior = atomic_exchange_int(&rseq_tls_offset, offset);
SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT);
constant_offset = (prior == 0 || prior == offset);
LOG(GLOBAL, LOG_LOADER, 2,
"Observed struct rseq @ " PFX " for thread => %s:-0x%x\n", app_addr,
get_register_name(LIB_SEG_TLS), -rseq_tls_offset);
} else
constant_offset = (seg_base + rseq_tls_offset == app_addr);
if (!constant_offset) {
REPORT_FATAL_ERROR_AND_EXIT(
RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(), get_application_pid(),
"struct rseq is not always in static thread-local storage");
ASSERT_NOT_REACHED();
}
}
/* Restartable sequence region identification.
*
* To avoid extra overhead going to disk to read section headers, we delay looking
* for rseq data until the app invokes an rseq syscall (or on attach we see a thread
* that has rseq set up). We document that we do not handle the app using rseq
* regions for non-rseq purposes, so we do not need to flush the cache here.
* Since we also identify the rseq_cs address here, this should be called *after*
* the app has registered the current thread for rseq.
*/
void
rseq_locate_rseq_regions(void)
{
if (rseq_enabled)
return;
/* This is a global operation, but the trigger could be hit by two threads at once,
* thus requiring synchronization.
*/
d_r_mutex_lock(&rseq_trigger_lock);
if (rseq_enabled) {
d_r_mutex_unlock(&rseq_trigger_lock);
return;
}
int offset = 0;
if (rseq_tls_offset == 0) {
/* Identify the TLS offset of this thread's struct rseq. */
offset = rseq_locate_tls_offset();
if (offset == 0) {
REPORT_FATAL_ERROR_AND_EXIT(
RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(),
get_application_pid(),
"struct rseq is not in static thread-local storage");
ASSERT_NOT_REACHED();
}
}
SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT);
bool new_value = true;
ATOMIC_1BYTE_WRITE(&rseq_enabled, new_value, false);
if (rseq_tls_offset == 0)
ATOMIC_4BYTE_WRITE(&rseq_tls_offset, offset, false);
SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT);
module_iterator_t *iter = module_iterator_start();
while (module_iterator_hasnext(iter)) {
module_area_t *ma = module_iterator_next(iter);
rseq_process_module(ma, false /*!at_map*/);
}
module_iterator_stop(iter);
d_r_mutex_unlock(&rseq_trigger_lock);
}
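/* Module load hook: scans the new module for rseq tables, but only once rseq has
* been activated by rseq_locate_rseq_regions(); otherwise identification stays lazy.
*/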
void
rseq_module_init(module_area_t *ma, bool at_map)
{
if (rseq_enabled) {
rseq_process_module(ma, at_map);
}
}
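/* Called when a native rseq execution aborts and the thread is redirected to the
* abort handler: raises a DR_XFER_RSEQ_ABORT kernel-transfer event for clients.
*/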
void
rseq_process_native_abort(dcontext_t *dcontext)
{
#ifdef CLIENT_INTERFACE
/* Raise a transfer event. */
LOG(THREAD, LOG_INTERP | LOG_VMAREAS, 2, "Abort triggered in rseq native code\n");
get_mcontext(dcontext)->pc = dcontext->next_tag;
if (instrument_kernel_xfer(dcontext, DR_XFER_RSEQ_ABORT, osc_empty,
/* We do not know the source PC so we do not
* supply a source state.
*/
NULL, NULL, dcontext->next_tag,
get_mcontext(dcontext)->xsp, osc_empty,
get_mcontext(dcontext), 0)) {
dcontext->next_tag = canonicalize_pc_target(dcontext, get_mcontext(dcontext)->pc);
}
#endif
}