| /* ******************************************************************************* |
| * Copyright (c) 2019-2020 Google, Inc. All rights reserved. |
| * *******************************************************************************/ |
| |
| /* |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * * Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * * Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * * Neither the name of Google, Inc. nor the names of its contributors may be |
| * used to endorse or promote products derived from this software without |
| * specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE, INC. OR CONTRIBUTORS BE LIABLE |
| * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
| * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH |
| * DAMAGE. |
| */ |
| |
| /************************************************************************************** |
| * Restartable sequence ("rseq") support (i#2350). |
| * This is a kernel feature which provides cpu-atomic regions: if a thread |
| * is pre-empted within an rseq region, an abort handler is invoked. |
| * The feature is difficult to handle under binary instrumentation. |
| * We rely on the app following certain conventions, including containing a |
| * section holding a table of all rseq sequences. |
| */ |
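| /* For reference, a minimal sketch of the app-side conventions we rely on. This |
| * is illustrative assembler-style pseudocode, not code from any particular app: |
| * each sequence is described by a struct rseq_cs whose address is published in |
| * one of the ELF sections searched in rseq_process_elf_sections() below, and |
| * whose 4-byte signature immediately precedes the abort handler: |
| * |
| * descriptor: .long 0, 0 // version, flags |
| * .quad 1f // start_ip |
| * .quad 2f - 1f // post_commit_offset |
| * .quad 4f // abort_ip |
| * 1: <restartable loads and computation> |
| * <single committing store> |
| * 2: // end of the critical section |
| * 3: .long SIGNATURE // verified by the kernel |
| * 4: <abort handler> |
| * |
| * If the thread is preempted or migrated inside [1,2), the kernel checks that |
| * the 4 bytes before abort_ip match the registered signature and redirects the |
| * thread to 4. |
| */ |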
| |
| #include "../globals.h" |
| #include "../module_shared.h" |
| #include "module_private.h" |
| #include "os_private.h" |
| #include "rseq_linux.h" |
| #include "../fragment.h" |
| #include "decode.h" |
| #ifdef CLIENT_INTERFACE |
| # include "instrument.h" |
| #endif |
| #include <stddef.h> |
| #ifdef HAVE_RSEQ |
| # include <linux/rseq.h> |
| #else |
| struct rseq_cs { |
| uint version; |
| uint flags; |
| uint64 start_ip; |
| uint64 post_commit_offset; |
| uint64 abort_ip; |
| } __attribute__((aligned(4 * sizeof(uint64)))); |
| struct rseq { |
| uint cpu_id_start; |
| uint cpu_id; |
| union { |
| uint64 ptr64; |
| } rseq_cs; |
| uint flags; |
| } __attribute__((aligned(4 * sizeof(uint64)))); |
| # define RSEQ_FLAG_UNREGISTER 1 |
| #endif |
| #include "include/syscall.h" |
| #include <errno.h> |
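| /* For reference, the raw Linux ABI invoked throughout this file: |
| * rseq(struct rseq *rseq, uint32 rseq_len, int flags, uint32 sig) |
| * flags == 0 registers the thread's struct; flags == RSEQ_FLAG_UNREGISTER |
| * unregisters it, and sig must then match the registered signature. |
| * dynamorio_syscall() returns 0 on success or a negative errno value. |
| */ |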
| |
| vm_area_vector_t *d_r_rseq_areas; |
| DECLARE_CXTSWPROT_VAR(static mutex_t rseq_trigger_lock, |
| INIT_LOCK_FREE(rseq_trigger_lock)); |
| static volatile bool rseq_enabled; |
| |
| /* We require all threads to use the same TLS offset to point at struct rseq. */ |
| static int rseq_tls_offset; |
| |
| /* The signature is registered per thread, but we require all registrations |
| * to be the same. |
| */ |
| static int rseq_signature; |
| |
| typedef struct _rseq_region_t { |
| app_pc start; |
| app_pc end; |
| app_pc handler; |
| /* If the sequence writes to registers that are also inputs, we must preserve |
| * those inputs so that our second invocation can target "start" instead of |
| * "handler". We only support GPR inputs: we document that no other inputs |
| * (no flags, no SIMD registers) are supported. |
| */ |
| bool reg_written[DR_NUM_GPR_REGS]; |
| } rseq_region_t; |
| |
| /* We need to store a struct rseq_cs per fragment_t. To avoid the cost of adding a |
| * pointer field to every fragment_t, and the complexity of another subclass like |
| * trace_t, we store them externally in a hashtable. The FRAG_HAS_RSEQ_ENDPOINT flag |
| * avoids the hashtable lookup on every fragment. |
| */ |
| static generic_table_t *rseq_cs_table; |
| #define INIT_RSEQ_CS_TABLE_SIZE 5 |
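| /* An illustrative read-side sketch (the actual consumers live in the |
| * mangling/translation code; "f" here is a hypothetical fragment_t*): |
| * TABLE_RWLOCK(rseq_cs_table, read, lock); |
| * void *rseq_cs_alloc = |
| * generic_hash_lookup(GLOBAL_DCONTEXT, rseq_cs_table, (ptr_uint_t)f); |
| * TABLE_RWLOCK(rseq_cs_table, read, unlock); |
| */ |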
| |
| /* vmvector callbacks */ |
| static void |
| rseq_area_free(void *data) |
| { |
| HEAP_TYPE_FREE(GLOBAL_DCONTEXT, data, rseq_region_t, ACCT_VMAREAS, PROTECTED); |
| } |
| |
| static void * |
| rseq_area_dup(void *data) |
| { |
| rseq_region_t *src = (rseq_region_t *)data; |
| rseq_region_t *dst = |
| HEAP_TYPE_ALLOC(GLOBAL_DCONTEXT, rseq_region_t, ACCT_VMAREAS, PROTECTED); |
| ASSERT(src != NULL); |
| *dst = *src; |
| return dst; |
| } |
| |
| /* We allocate room for a struct rseq_cs plus alignment slack so that an |
| * aligned struct rseq_cs always fits within the allocation. |
| */ |
| static inline size_t |
| rseq_cs_alloc_size(void) |
| { |
| return sizeof(struct rseq_cs) + __alignof(struct rseq_cs); |
| } |
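| /* Worked example for rseq_cs_alloc_size() and rseq_get_rseq_cs_alloc(), |
| * assuming __alignof(struct rseq_cs) == 32 and a hypothetical allocation at |
| * 0x1008: the allocation spans [0x1008, 0x1048), ALIGN_FORWARD(0x1008, 32) |
| * == 0x1020, and 0x1020 + 32 == 0x1040 <= 0x1048, so the aligned struct |
| * always fits no matter how misaligned the allocation start is. |
| */ |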
| |
| static void |
| rseq_cs_free(dcontext_t *dcontext, void *data) |
| { |
| global_heap_free(data, rseq_cs_alloc_size() HEAPACCT(ACCT_OTHER)); |
| } |
| |
| void |
| d_r_rseq_init(void) |
| { |
| VMVECTOR_ALLOC_VECTOR(d_r_rseq_areas, GLOBAL_DCONTEXT, |
| VECTOR_SHARED | VECTOR_NEVER_MERGE, rseq_areas); |
| vmvector_set_callbacks(d_r_rseq_areas, rseq_area_free, rseq_area_dup, NULL, NULL); |
| |
| rseq_cs_table = generic_hash_create(GLOBAL_DCONTEXT, INIT_RSEQ_CS_TABLE_SIZE, 80, |
| HASHTABLE_SHARED | HASHTABLE_PERSISTENT, |
| rseq_cs_free _IF_DEBUG("rseq_cs table")); |
| /* Enable rseq pre-attach for things like dr_prepopulate_cache(). */ |
| if (rseq_is_registered_for_current_thread()) |
| rseq_locate_rseq_regions(); |
| } |
| |
| void |
| d_r_rseq_exit(void) |
| { |
| generic_hash_destroy(GLOBAL_DCONTEXT, rseq_cs_table); |
| vmvector_delete_vector(GLOBAL_DCONTEXT, d_r_rseq_areas); |
| DELETE_LOCK(rseq_trigger_lock); |
| } |
| |
| void |
| rseq_thread_attach(dcontext_t *dcontext) |
| { |
| rseq_region_t *info; |
| if (!vmvector_lookup_data(d_r_rseq_areas, dcontext->next_tag, NULL, NULL, |
| (void **)&info)) |
| return; |
| /* The thread missed the save of its state on rseq entry. We could try to save here |
| * so the restore on rseq exit won't read incorrect values, but it's simpler and |
| * less error-prone to send it to the abort handler, like we do on detach or other |
| * translation points. |
| */ |
| dcontext->next_tag = info->handler; |
| } |
| |
| bool |
| rseq_get_region_info(app_pc pc, app_pc *start OUT, app_pc *end OUT, app_pc *handler OUT, |
| bool **reg_written OUT, int *reg_written_size OUT) |
| { |
| rseq_region_t *info; |
| if (!vmvector_lookup_data(d_r_rseq_areas, pc, start, end, (void **)&info)) |
| return false; |
| if (handler != NULL) |
| *handler = info->handler; |
| if (reg_written != NULL) |
| *reg_written = info->reg_written; |
| if (reg_written_size != NULL) |
| *reg_written_size = sizeof(info->reg_written) / sizeof(info->reg_written[0]); |
| return true; |
| } |
| |
| int |
| rseq_get_tls_ptr_offset(void) |
| { |
| /* This read is assumed to be atomic. */ |
| ASSERT(rseq_tls_offset != 0); |
| return rseq_tls_offset + offsetof(struct rseq, rseq_cs); |
| } |
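| /* Example of the sign convention above, assuming a hypothetical registration |
| * at %fs:-0x80 on x86-64: rseq_tls_offset == -0x80 and |
| * offsetof(struct rseq, rseq_cs) == 8, so rseq_get_tls_ptr_offset() returns |
| * -0x78, i.e., the kernel-read rseq_cs pointer lives at %fs:-0x78. |
| */ |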
| |
| static void |
| rseq_clear_tls_ptr(dcontext_t *dcontext) |
| { |
| ASSERT(rseq_tls_offset != 0); |
| byte *base = get_segment_base(LIB_SEG_TLS); |
| struct rseq *app_rseq = (struct rseq *)(base + rseq_tls_offset); |
| /* Code in the fragment cache writes this field directly, so we do not bother |
| * with safe_read or safe_write here either. We already cannot handle |
| * adversarial rseq usage. |
| */ |
| if (is_dynamo_address((byte *)(ptr_uint_t)app_rseq->rseq_cs.ptr64)) |
| app_rseq->rseq_cs.ptr64 = 0; |
| } |
| |
| int |
| rseq_get_signature(void) |
| { |
| /* This is only called after rseq is initialized and the signature determined. */ |
| ASSERT(rseq_enabled); |
| return rseq_signature; |
| } |
| |
| byte * |
| rseq_get_rseq_cs_alloc(byte **rseq_cs_aligned OUT) |
| { |
| byte *rseq_cs_alloc = global_heap_alloc(rseq_cs_alloc_size() HEAPACCT(ACCT_OTHER)); |
| *rseq_cs_aligned = (byte *)ALIGN_FORWARD(rseq_cs_alloc, __alignof(struct rseq_cs)); |
| return rseq_cs_alloc; |
| } |
| |
| void |
| rseq_record_rseq_cs(byte *rseq_cs_alloc, fragment_t *f, cache_pc start, cache_pc end, |
| cache_pc abort) |
| { |
| struct rseq_cs *target = |
| (struct rseq_cs *)ALIGN_FORWARD(rseq_cs_alloc, __alignof(struct rseq_cs)); |
| target->version = 0; |
| target->flags = 0; |
| target->start_ip = (ptr_uint_t)start; |
| target->post_commit_offset = (ptr_uint_t)(end - start); |
| target->abort_ip = (ptr_uint_t)abort; |
| TABLE_RWLOCK(rseq_cs_table, write, lock); |
| generic_hash_add(GLOBAL_DCONTEXT, rseq_cs_table, (ptr_uint_t)f, rseq_cs_alloc); |
| TABLE_RWLOCK(rseq_cs_table, write, unlock); |
| } |
| |
| void |
| rseq_remove_fragment(dcontext_t *dcontext, fragment_t *f) |
| { |
| if (!rseq_enabled) |
| return; |
| /* Avoid freeing a live rseq_cs for a thread-private fragment deletion. */ |
| rseq_clear_tls_ptr(dcontext); |
| TABLE_RWLOCK(rseq_cs_table, write, lock); |
| generic_hash_remove(GLOBAL_DCONTEXT, rseq_cs_table, (ptr_uint_t)f); |
| TABLE_RWLOCK(rseq_cs_table, write, unlock); |
| } |
| |
| void |
| rseq_shared_fragment_flushtime_update(dcontext_t *dcontext) |
| { |
| if (!rseq_enabled) |
| return; |
| /* Avoid freeing a live rseq_cs for thread-shared fragment deletion. |
| * We clear the pointer on completion of the native rseq execution, but it's |
| * not easy to clear it on midpoint exits. We instead clear prior to |
| * rseq_cs being freed: for thread-private in rseq_remove_fragment() and for |
| * thread-shared each thread should come here prior to deletion. |
| */ |
| rseq_clear_tls_ptr(dcontext); |
| } |
| |
| #ifdef HAVE_RSEQ |
| bool |
| rseq_is_registered_for_current_thread(void) |
| { |
| /* Unfortunately there's no way to query the current rseq struct. |
| * For 64-bit we can pass a kernel address and look for EFAULT |
| * vs EINVAL, but there is no kernel address for 32-bit. |
| * So we try to perform a legitimate registration. |
| */ |
| struct rseq test_rseq = { 0 }; |
| int res = dynamorio_syscall(SYS_rseq, 4, &test_rseq, sizeof(test_rseq), 0, 0); |
| if (res == -EINVAL) /* Our struct != registered struct. */ |
| return true; |
| if (res == -ENOSYS) |
| return false; |
| /* If seccomp blocks SYS_rseq we'll get -EPERM. SYS_rseq also returns -EPERM |
| * if &test_rseq == the app's struct but the signature is different, but that |
| * seems so unlikely that we just assume -EPERM implies seccomp. |
| */ |
| if (res == -EPERM) |
| return false; |
| ASSERT(res == 0); /* If not, the struct size or something else changed! */ |
| if (dynamorio_syscall(SYS_rseq, 4, &test_rseq, sizeof(test_rseq), |
| RSEQ_FLAG_UNREGISTER, 0) != 0) { |
| ASSERT_NOT_REACHED(); |
| } |
| return false; |
| } |
| #else |
| bool |
| rseq_is_registered_for_current_thread(void) |
| { |
| return false; |
| } |
| #endif |
| |
| static void |
| rseq_analyze_instructions(rseq_region_t *info) |
| { |
| /* We analyze the instructions inside [start,end) looking for register state that we |
| * need to preserve for our restart. We do not want to blindly spill and restore |
| * 16+ registers for every sequence (too much overhead). |
| */ |
| instr_t instr; |
| instr_init(GLOBAL_DCONTEXT, &instr); |
| app_pc pc = info->start; |
| int i; |
| bool reached_cti = false; |
| memset(info->reg_written, 0, sizeof(info->reg_written)); |
| while (pc < info->end) { |
| instr_reset(GLOBAL_DCONTEXT, &instr); |
| app_pc next_pc = decode(GLOBAL_DCONTEXT, pc, &instr); |
| if (next_pc == NULL) { |
| REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3, |
| get_application_name(), get_application_pid(), |
| "Rseq sequence contains invalid instructions"); |
| ASSERT_NOT_REACHED(); |
| } |
| if (instr_is_syscall(&instr) |
| /* Allow a syscall for our test in debug build. */ |
| IF_DEBUG( |
| &&!check_filter("api.rseq;linux.rseq;linux.rseq_table;linux.rseq_noarray", |
| get_short_name(get_application_name())))) { |
| REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3, |
| get_application_name(), get_application_pid(), |
| "Rseq sequence contains a system call"); |
| ASSERT_NOT_REACHED(); |
| } |
| if (instr_is_call(&instr)) { |
| REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3, |
| get_application_name(), get_application_pid(), |
| "Rseq sequence contains a call"); |
| ASSERT_NOT_REACHED(); |
| } |
| if (instr_is_cti(&instr)) |
| reached_cti = true; |
| /* We potentially need to preserve any register written anywhere inside |
| * the sequence. We can't limit ourselves to registers clearly live on |
| * input, since code *after* the sequence could read them. We do disallow |
| * callouts to helper functions to simplify our lives. |
| * |
| * We only preserve GPRs, for simplicity, and because they are far more likely |
| * as inputs than flags or SIMD registers. We would like to verify that only |
| * GPRs are used, but A) we cannot easily check values read *after* the |
| * sequence (the handler could set up state that is read afterward and |
| * sometimes clobbered inside), B) we do want to support SIMD and flag writes |
| * within the sequence, and C) even checking for values read in the sequence |
| * would require new interfaces like DR_REG_START_SIMD or register iterators |
| * for reasonable code. |
| */ |
| for (i = 0; i < DR_NUM_GPR_REGS; i++) { |
| if (info->reg_written[i]) |
| continue; |
| reg_id_t reg = DR_REG_START_GPR + (reg_id_t)i; |
| if (instr_writes_to_reg(&instr, reg, DR_QUERY_DEFAULT)) { |
| LOG(GLOBAL, LOG_LOADER, 3, |
| "Rseq region @" PFX " writes register %s at " PFX "\n", info->start, |
| reg_names[reg], pc); |
| info->reg_written[i] = true; |
| } |
| } |
| pc = next_pc; |
| } |
| instr_free(GLOBAL_DCONTEXT, &instr); |
| } |
| |
| static void |
| rseq_process_entry(struct rseq_cs *entry, ssize_t load_offs) |
| { |
| LOG(GLOBAL, LOG_LOADER, 2, |
| "Found rseq region: ver=%u; flags=%u; start=" PFX "; end=" PFX "; abort=" PFX |
| "\n", |
| entry->version, entry->flags, entry->start_ip + load_offs, |
| entry->start_ip + entry->post_commit_offset + load_offs, |
| entry->abort_ip + load_offs); |
| rseq_region_t *info = |
| HEAP_TYPE_ALLOC(GLOBAL_DCONTEXT, rseq_region_t, ACCT_VMAREAS, PROTECTED); |
| info->start = (app_pc)(ptr_uint_t)entry->start_ip + load_offs; |
| info->end = info->start + entry->post_commit_offset; |
| info->handler = (app_pc)(ptr_uint_t)entry->abort_ip + load_offs; |
| int signature; |
| if (!d_r_safe_read(info->handler - sizeof(signature), sizeof(signature), |
| &signature)) { |
| REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(), |
| get_application_pid(), |
| "Rseq signature is unreadable"); |
| ASSERT_NOT_REACHED(); |
| } |
| if (signature != rseq_signature) { |
| if (rseq_signature == 0) { |
| SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT); |
| ATOMIC_4BYTE_WRITE(&rseq_signature, signature, false); |
| SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT); |
| LOG(GLOBAL, LOG_LOADER, 2, "Rseq signature is 0x%08x\n", rseq_signature); |
| } else { |
| REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3, |
| get_application_name(), get_application_pid(), |
| "Rseq signatures are not all identical"); |
| ASSERT_NOT_REACHED(); |
| } |
| } |
| rseq_analyze_instructions(info); |
| vmvector_add(d_r_rseq_areas, info->start, info->end, (void *)info); |
| RSTATS_INC(num_rseq_regions); |
| /* Check the start pc. We do not go to the effort of checking for non-tag or |
| * interior pcs. |
| */ |
| if (fragment_lookup(GLOBAL_DCONTEXT, info->start) != NULL) { |
| /* We rely on the app not running rseq code for non-rseq purposes (since we |
| * can't easily tell the difference; plus we avoid a flush for lazy rseq |
| * activation). |
| */ |
| REPORT_FATAL_ERROR_AND_EXIT( |
| RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(), get_application_pid(), |
| "Rseq sequences must not be used for non-rseq purposes"); |
| ASSERT_NOT_REACHED(); |
| } |
| } |
| |
| static void |
| rseq_process_elf_sections(module_area_t *ma, bool at_map, |
| ELF_SECTION_HEADER_TYPE *sec_hdr_start, const char *strtab, |
| ssize_t load_offs) |
| { |
| bool found_array = false; |
| uint i; |
| ELF_HEADER_TYPE *elf_hdr = (ELF_HEADER_TYPE *)ma->start; |
| ELF_SECTION_HEADER_TYPE *sec_hdr = sec_hdr_start; |
| /* The sh_addr values in the section headers are unrelocated and always need |
| * load_offs. The rseq entries in memory are normally relocated and need the |
| * offset only if relocations have not yet been applied (at map time, or |
| * during early injection before relocation). |
| */ |
| ssize_t entry_offs = 0; |
| if (at_map || (DYNAMO_OPTION(early_inject) && !dr_api_entry && !dynamo_started)) |
| entry_offs = load_offs; |
| for (i = 0; i < elf_hdr->e_shnum; i++) { |
| #define RSEQ_PTR_ARRAY_SEC_NAME "__rseq_cs_ptr_array" |
| if (strcmp(strtab + sec_hdr->sh_name, RSEQ_PTR_ARRAY_SEC_NAME) == 0) { |
| found_array = true; |
| byte **ptrs = (byte **)(sec_hdr->sh_addr + load_offs); |
| int j; |
| for (j = 0; j < sec_hdr->sh_size / sizeof(*ptrs); ++j) { |
| /* We require that the table is loaded. If not, bail, but unlike |
| * failing to find section headers, make this a fatal error: better |
| * to notify the user than try to run the rseq w/o proper handling. |
| */ |
| if (ptrs < (byte **)ma->start || ptrs > (byte **)ma->end) { |
| REPORT_FATAL_ERROR_AND_EXIT( |
| RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(), |
| get_application_pid(), |
| RSEQ_PTR_ARRAY_SEC_NAME " is not in a loaded segment"); |
| ASSERT_NOT_REACHED(); |
| } |
| /* We assume this is a full mapping and it is safe to read the data |
| * (a partial map should not make it to module list processing). |
| * We still perform a sanity check to handle unusual non-relocated |
| * cases (it is conceivable this array is not in a loaded segment). |
| */ |
| byte *entry = *ptrs + entry_offs; |
| if (entry < ma->start || entry > ma->end) { |
| REPORT_FATAL_ERROR_AND_EXIT( |
| RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(), |
| get_application_pid(), |
| RSEQ_PTR_ARRAY_SEC_NAME "'s entries are not in a loaded segment"); |
| ASSERT_NOT_REACHED(); |
| } |
| rseq_process_entry((struct rseq_cs *)entry, entry_offs); |
| ++ptrs; |
| } |
| break; |
| } |
| ++sec_hdr; |
| } |
| if (!found_array) { |
| sec_hdr = sec_hdr_start; |
| for (i = 0; i < elf_hdr->e_shnum; i++) { |
| #define RSEQ_SEC_NAME "__rseq_cs" |
| #define RSEQ_OLD_SEC_NAME "__rseq_table" |
| if (strcmp(strtab + sec_hdr->sh_name, RSEQ_SEC_NAME) == 0 || |
| strcmp(strtab + sec_hdr->sh_name, RSEQ_OLD_SEC_NAME) == 0) { |
| /* There may be padding at the start of the section, so ensure we skip |
| * over it. We're reading the loaded data, not the file, so it will |
| * always be aligned. |
| */ |
| #define RSEQ_CS_ALIGNMENT (4 * sizeof(uint64)) |
| struct rseq_cs *array = (struct rseq_cs *)ALIGN_FORWARD( |
| sec_hdr->sh_addr + load_offs, RSEQ_CS_ALIGNMENT); |
| int j; |
| for (j = 0; j < sec_hdr->sh_size / sizeof(*array); ++j) { |
| /* We require that the table is loaded. If not, bail. */ |
| if (array < (struct rseq_cs *)ma->start || |
| array > (struct rseq_cs *)ma->end) { |
| REPORT_FATAL_ERROR_AND_EXIT( |
| RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(), |
| get_application_pid(), |
| RSEQ_SEC_NAME " is not in a loaded segment"); |
| ASSERT_NOT_REACHED(); |
| } |
| rseq_process_entry(array, entry_offs); |
| ++array; |
| } |
| break; |
| } |
| ++sec_hdr; |
| } |
| } |
| } |
| |
| /* Returns whether we successfully searched for rseq data (not whether we found any). */ |
| static bool |
| rseq_process_module(module_area_t *ma, bool at_map) |
| { |
| bool res = false; |
| ASSERT(is_elf_so_header(ma->start, ma->end - ma->start)); |
| ELF_HEADER_TYPE *elf_hdr = (ELF_HEADER_TYPE *)ma->start; |
| ASSERT(elf_hdr->e_shentsize == sizeof(ELF_SECTION_HEADER_TYPE)); |
| int fd = INVALID_FILE; |
| byte *sec_map = NULL, *str_map = NULL; |
| size_t sec_size = 0, str_size = 0; |
| ELF_SECTION_HEADER_TYPE *sec_hdr = NULL; |
| char *strtab; |
| ssize_t load_offs = ma->start - ma->os_data.base_address; |
| if (at_map && elf_hdr->e_shoff + ma->start < ma->end) { |
| sec_map = elf_hdr->e_shoff + ma->start; |
| sec_hdr = (ELF_SECTION_HEADER_TYPE *)sec_map; |
| /* We assume strtab is there too. */ |
| strtab = (char *)(ma->start + sec_hdr[elf_hdr->e_shstrndx].sh_offset); |
| if (strtab > (char *)ma->end) |
| goto rseq_process_module_cleanup; |
| } else { |
| /* The section headers are not mapped in. Unfortunately this is the common |
| * case: they are typically at the end of the file. For this reason, we delay |
| * calling this function until we see the app use rseq. |
| */ |
| if (ma->full_path == NULL) |
| goto rseq_process_module_cleanup; |
| fd = os_open(ma->full_path, OS_OPEN_READ); |
| if (fd == INVALID_FILE) |
| goto rseq_process_module_cleanup; |
| off_t offs = ALIGN_BACKWARD(elf_hdr->e_shoff, PAGE_SIZE); |
| sec_size = |
| ALIGN_FORWARD(elf_hdr->e_shoff + elf_hdr->e_shnum * elf_hdr->e_shentsize, |
| PAGE_SIZE) - |
| offs; |
| sec_map = |
| os_map_file(fd, &sec_size, offs, NULL, MEMPROT_READ, MAP_FILE_COPY_ON_WRITE); |
| if (sec_map == NULL) |
| goto rseq_process_module_cleanup; |
| sec_hdr = (ELF_SECTION_HEADER_TYPE *)(sec_map + elf_hdr->e_shoff - offs); |
| /* We also need the section header string table. */ |
| offs = ALIGN_BACKWARD(sec_hdr[elf_hdr->e_shstrndx].sh_offset, PAGE_SIZE); |
| str_size = ALIGN_FORWARD(sec_hdr[elf_hdr->e_shstrndx].sh_offset + |
| sec_hdr[elf_hdr->e_shstrndx].sh_size, |
| PAGE_SIZE) - |
| offs; |
| str_map = |
| os_map_file(fd, &str_size, offs, NULL, MEMPROT_READ, MAP_FILE_COPY_ON_WRITE); |
| if (str_map == NULL) |
| goto rseq_process_module_cleanup; |
| strtab = (char *)(str_map + sec_hdr[elf_hdr->e_shstrndx].sh_offset - offs); |
| } |
| rseq_process_elf_sections(ma, at_map, sec_hdr, strtab, load_offs); |
| res = true; |
| rseq_process_module_cleanup: |
| if (str_size != 0) |
| os_unmap_file(str_map, str_size); |
| if (sec_size != 0) |
| os_unmap_file(sec_map, sec_size); |
| if (fd != INVALID_FILE) |
| os_close(fd); |
| DODEBUG({ |
| if (!res) { |
| const char *name = GET_MODULE_NAME(&ma->names); |
| if (name == NULL) |
| name = "(null)"; |
| LOG(GLOBAL, LOG_INTERP | LOG_VMAREAS, 2, |
| "%s: error looking for rseq table in %s\n", __FUNCTION__, name); |
| if (strstr(name, "linux-vdso.so") == NULL) { |
| SYSLOG_INTERNAL_WARNING_ONCE( |
| "Failed to identify whether a module has an rseq table"); |
| } |
| } |
| }); |
| return res; |
| } |
| |
| static int |
| rseq_locate_tls_offset(void) |
| { |
| /* We assume (and document) that the loader's static TLS is used, so every thread |
| * has a consistent %fs:-offs address. Unfortunately, using a local copy of the |
| * rseq code for our non-instrumented execution requires us to locate the app's |
| * struct using heuristics, because the system call was poorly designed and will not |
| * let us replace the app's. Alternatives of no local copy have worse problems. |
| */ |
| /* Static TLS is at a negative offset from the app library segment base. We simply |
| * search all possible aligned slots. Typically there are <64 possible slots. |
| */ |
| int offset = 0; |
| byte *addr = get_app_segment_base(LIB_SEG_TLS); |
| byte *seg_bottom; |
| if (addr != NULL && get_memory_info(addr, &seg_bottom, NULL, NULL)) { |
| LOG(GLOBAL, LOG_LOADER, 3, "rseq within static TLS " PFX " - " PFX "\n", |
| seg_bottom, addr); |
| /* struct rseq is aligned to 32 bytes, matching __alignof(struct rseq_cs). */ |
| int alignment = __alignof(struct rseq_cs); |
| int i; |
| for (i = 0; addr - i * alignment >= seg_bottom; i++) { |
| byte *try_addr = addr - i * alignment; |
| ASSERT(try_addr >= seg_bottom); /* For loop guarantees this. */ |
| /* Our strategy is to check all of the aligned static TLS addresses to |
| * find the registered one. Our caller is not supposed to call here |
| * until the app has registered the current thread. |
| */ |
| static const int RSEQ_RARE_SIGNATURE = 42; |
| int res = dynamorio_syscall(SYS_rseq, 4, try_addr, sizeof(struct rseq), |
| RSEQ_FLAG_UNREGISTER, RSEQ_RARE_SIGNATURE); |
| LOG(GLOBAL, LOG_LOADER, 3, "Tried rseq @ " PFX " => %d\n", try_addr, res); |
| if (res == -EINVAL) /* Our struct != registered struct. */ |
| continue; |
| /* We expect -EPERM on a signature mismatch. On the small chance the app |
| * actually used 42 for its signature we'll have to re-register it. |
| */ |
| if (res == 0) { |
| /* Use a separate variable so we do not shadow "res", which the -EPERM |
| * check below relies upon. |
| */ |
| int rereg_res = dynamorio_syscall(SYS_rseq, 4, try_addr, sizeof(struct rseq), |
| 0, RSEQ_RARE_SIGNATURE); |
| ASSERT(rereg_res == 0); |
| res = -EPERM; |
| } |
| if (res == -EPERM) { |
| /* Found it! */ |
| LOG(GLOBAL, LOG_LOADER, 2, |
| "Found struct rseq @ " PFX " for thread => %s:-0x%x\n", try_addr, |
| get_register_name(LIB_SEG_TLS), i * alignment); |
| offset = -i * alignment; |
| } |
| break; |
| } |
| } |
| return offset; |
| } |
| |
| void |
| rseq_process_syscall(dcontext_t *dcontext) |
| { |
| byte *seg_base = get_app_segment_base(LIB_SEG_TLS); |
| byte *app_addr = (byte *)dcontext->sys_param0; |
| bool constant_offset = false; |
| if (rseq_tls_offset == 0) { |
| SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT); |
| int offset = app_addr - seg_base; |
| /* To handle races here, we use an atomic_exchange. */ |
| int prior = atomic_exchange_int(&rseq_tls_offset, offset); |
| SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT); |
| constant_offset = (prior == 0 || prior == offset); |
| LOG(GLOBAL, LOG_LOADER, 2, |
| "Observed struct rseq @ " PFX " for thread => %s:-0x%x\n", app_addr, |
| get_register_name(LIB_SEG_TLS), -rseq_tls_offset); |
| } else |
| constant_offset = (seg_base + rseq_tls_offset == app_addr); |
| if (!constant_offset) { |
| REPORT_FATAL_ERROR_AND_EXIT( |
| RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(), get_application_pid(), |
| "struct rseq is not always in static thread-local storage"); |
| ASSERT_NOT_REACHED(); |
| } |
| } |
| |
| /* Restartable sequence region identification. |
| * |
| * To avoid extra overhead going to disk to read section headers, we delay looking |
| * for rseq data until the app invokes an rseq syscall (or on attach we see a thread |
| * that has rseq set up). We document that we do not handle the app using rseq |
| * regions for non-rseq purposes, so we do not need to flush the cache here. |
| * Since we also identify the rseq_cs address here, this should be called *after* |
| * the app has registered the current thread for rseq. |
| */ |
| void |
| rseq_locate_rseq_regions(void) |
| { |
| if (rseq_enabled) |
| return; |
| /* This is a global operation, but the trigger could be hit by two threads at once, |
| * thus requiring synchronization. |
| */ |
| d_r_mutex_lock(&rseq_trigger_lock); |
| if (rseq_enabled) { |
| d_r_mutex_unlock(&rseq_trigger_lock); |
| return; |
| } |
| |
| int offset = 0; |
| if (rseq_tls_offset == 0) { |
| /* Identify the TLS offset of this thread's struct rseq. */ |
| offset = rseq_locate_tls_offset(); |
| if (offset == 0) { |
| REPORT_FATAL_ERROR_AND_EXIT( |
| RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(), |
| get_application_pid(), |
| "struct rseq is not in static thread-local storage"); |
| ASSERT_NOT_REACHED(); |
| } |
| } |
| |
| SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT); |
| bool new_value = true; |
| ATOMIC_1BYTE_WRITE(&rseq_enabled, new_value, false); |
| if (rseq_tls_offset == 0) |
| ATOMIC_4BYTE_WRITE(&rseq_tls_offset, offset, false); |
| SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT); |
| |
| module_iterator_t *iter = module_iterator_start(); |
| while (module_iterator_hasnext(iter)) { |
| module_area_t *ma = module_iterator_next(iter); |
| rseq_process_module(ma, false /*!at_map*/); |
| } |
| module_iterator_stop(iter); |
| d_r_mutex_unlock(&rseq_trigger_lock); |
| } |
| |
| void |
| rseq_module_init(module_area_t *ma, bool at_map) |
| { |
| if (rseq_enabled) { |
| rseq_process_module(ma, at_map); |
| } |
| } |
| |
| void |
| rseq_process_native_abort(dcontext_t *dcontext) |
| { |
| #ifdef CLIENT_INTERFACE |
| /* Raise a transfer event. */ |
| LOG(THREAD, LOG_INTERP | LOG_VMAREAS, 2, "Abort triggered in rseq native code\n"); |
| get_mcontext(dcontext)->pc = dcontext->next_tag; |
| if (instrument_kernel_xfer(dcontext, DR_XFER_RSEQ_ABORT, osc_empty, |
| /* We do not know the source PC so we do not |
| * supply a source state. |
| */ |
| NULL, NULL, dcontext->next_tag, |
| get_mcontext(dcontext)->xsp, osc_empty, |
| get_mcontext(dcontext), 0)) { |
| dcontext->next_tag = canonicalize_pc_target(dcontext, get_mcontext(dcontext)->pc); |
| } |
| #endif |
| } |