blob: 1db0fa545bd7bb3c6ca5e365c921008801dfe116 [file] [log] [blame]
/* **********************************************************
* Copyright (c) 2014-2024 Google, Inc. All rights reserved.
* **********************************************************/
/*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of Google, Inc. nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
/* DRSyms DynamoRIO Extension */
/* Symbol lookup routines for Mach-O */
#include "dr_api.h"
#include "drsyms.h"
#include "drsyms_private.h"
#include "drsyms_obj.h"
#include "hashtable.h"
#include "dwarf.h"
#include "libdwarf.h"
#include <mach-o/loader.h> /* mach_header */
#include <mach-o/nlist.h>
#include <mach-o/fat.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>
#include <errno.h>
#include <limits.h>
#ifndef MIN
# define MIN(x, y) ((x) <= (y) ? (x) : (y))
#endif
#ifndef MAX
# define MAX(x, y) ((x) >= (y) ? (x) : (y))
#endif
#ifndef SIZE_T_MAX
# ifdef X64
# define SIZE_T_MAX ULLONG_MAX
# else
# define SIZE_T_MAX UINT_MAX
# endif
#endif
static uint verbose = 0;
#undef NOTIFY
#define NOTIFY(n, ...) \
do { \
if (verbose >= (n)) { \
dr_fprintf(STDERR, __VA_ARGS__); \
} \
} while (0)
#define NOTIFY_DWARF(de) \
do { \
if (verbose) { \
dr_fprintf(STDERR, "drsyms: Dwarf error: %s\n", dwarf_errmsg(de)); \
} \
} while (0)
/* XXX i#1345: support mixed-mode 32-bit and 64-bit in one process.
* There is no official support for that on Linux or Windows and for now we do
* not support it either, especially not mixing libraries.
*/
#ifdef X64
typedef struct mach_header_64 mach_header_t;
typedef struct segment_command_64 segment_command_t;
typedef struct section_64 section_t;
typedef struct nlist_64 nlist_t;
#else
typedef struct mach_header mach_header_t;
typedef struct segment_command segment_command_t;
typedef struct section section_t;
typedef struct nlist nlist_t;
#endif
typedef struct _macho_info_t {
byte *map_base;
ptr_uint_t load_base;
size_t load_size;
drsym_debug_kind_t debug_kind;
nlist_t *syms;
uint num_syms;
byte *strtab;
uint strsz;
/* Amount to adjust all offsets for __PAGEZERO + PIE (i#1365) */
ssize_t offs_adjust;
/* Used for locating dSYM symbols */
uint8_t uuid[16];
char dsym_path[MAXIMUM_PATH];
/* Since we have no symbol sizes, we sort the symbol table to get
* at least the value of the next entry.
*/
nlist_t **sorted_syms;
uint sorted_count;
} macho_info_t;
/******************************************************************************
* Internal routines
*/
static bool
is_macho_header(byte *base)
{
mach_header_t *hdr = (mach_header_t *)base;
/* We deliberately don't check hdr->filetype as we don't want to limit
* ourselves to just MH_EXECUTE, MH_DYLIB, or MH_BUNDLE in case others
* have symbols as well.
*/
#ifdef X64
return (hdr->magic == MH_MAGIC_64 && hdr->cputype == CPU_TYPE_X86_64);
#else
return (hdr->magic == MH_MAGIC && hdr->cputype == CPU_TYPE_X86);
#endif
}
static bool
is_fat_header(byte *base)
{
struct fat_header *hdr = (struct fat_header *)base;
/* all fields are big-endian */
return (hdr->magic == FAT_CIGAM);
}
/* Handle a universal or "fat" binary by locating the piece we want inside it */
static byte *
find_macho_in_fat_binary(byte *base, size_t *arch_size)
{
struct fat_header *hdr = (struct fat_header *)base;
struct fat_arch *arch;
uint num, i;
/* all fields are big-endian */
if (hdr->magic != FAT_CIGAM)
return NULL;
num = OSSwapInt32(hdr->nfat_arch);
for (i = 0; i < num; i++) {
arch = ((struct fat_arch *)(hdr + 1)) + i;
if (OSSwapInt32(arch->cputype) == IF_X64_ELSE(CPU_TYPE_X86_64, CPU_TYPE_X86)) {
if (arch_size != NULL)
*arch_size = OSSwapInt32(arch->size);
return base + OSSwapInt32(arch->offset);
}
}
return NULL;
}
static byte *
find_macho_header(byte *map_base)
{
byte *arch_base = map_base;
if (is_fat_header(map_base)) {
arch_base = find_macho_in_fat_binary(map_base, NULL);
if (arch_base == NULL)
return NULL;
}
if (!is_macho_header(arch_base))
return NULL;
return arch_base;
}
/* Iterates the program headers for a Mach-O object and returns the minimum
* segment load address. For executables this is generally a well-known
* address. For PIC shared libraries this is usually 0. For DR clients this is
* the preferred load address. If we find no loadable sections, we return zero
* also.
*/
static ptr_uint_t
find_load_base(byte *map_base, size_t *load_size)
{
mach_header_t *hdr = (mach_header_t *)map_base;
struct load_command *cmd, *cmd_stop;
ptr_uint_t load_base = 0, load_end = 0;
bool found_seg = false;
if (!is_macho_header(map_base))
return 0;
cmd = (struct load_command *)(hdr + 1);
cmd_stop = (struct load_command *)((byte *)cmd + hdr->sizeofcmds);
while (cmd < cmd_stop) {
if (cmd->cmd == LC_SEGMENT || cmd->cmd == LC_SEGMENT_64) {
segment_command_t *seg = (segment_command_t *)cmd;
if (!found_seg) {
found_seg = true;
load_base = seg->vmaddr;
load_end = seg->vmaddr + seg->vmsize;
} else {
load_base = MIN(load_base, seg->vmaddr);
load_end = MAX(load_end, seg->vmaddr + seg->vmsize);
}
}
cmd = (struct load_command *)((byte *)cmd + cmd->cmdsize);
}
if (load_size != NULL)
*load_size = (load_end - load_base);
return load_base;
}
/* The LC_UUID section is present in both modules and dSYM files, so we
* use it to match them up.
*/
static bool
drsym_macho_uuids_match(macho_info_t *mod, const char *path)
{
bool match = false;
mach_header_t *hdr;
struct load_command *cmd, *cmd_stop;
byte *map_base = NULL, *arch_base;
uint64 file_size;
size_t map_size;
file_t fd = dr_open_file(path, DR_FILE_READ);
if (fd == INVALID_FILE) {
NOTIFY(1, "%s: unable to open %s\n", __FUNCTION__, path);
goto uuid_error;
}
if (!dr_file_size(fd, &file_size)) {
NOTIFY(1, "%s: unable to get file size %s\n", __FUNCTION__, path);
goto uuid_error;
}
map_size = file_size;
map_base = dr_map_file(fd, &map_size, 0, NULL, DR_MEMPROT_READ, DR_MAP_PRIVATE);
if (map_base == NULL || map_size < file_size) {
NOTIFY(1, "%s: unable to map %s\n", __FUNCTION__, path);
goto uuid_error;
}
arch_base = find_macho_header(map_base);
if (arch_base == NULL) {
NOTIFY(1, "%s: did not find Mach-O header in %s\n", __FUNCTION__, path);
goto uuid_error;
}
hdr = (mach_header_t *)arch_base;
cmd = (struct load_command *)(hdr + 1);
cmd_stop = (struct load_command *)((char *)cmd + hdr->sizeofcmds);
while (cmd < cmd_stop) {
if (cmd->cmd == LC_UUID) {
match = (memcmp(mod->uuid, ((struct uuid_command *)cmd)->uuid,
sizeof(mod->uuid)) == 0);
NOTIFY(2, "%s: uuid %s for %s\n", __FUNCTION__,
match ? "matches" : "does NOT match", path);
break;
}
}
uuid_error:
if (map_base != NULL)
dr_unmap_file(map_base, map_size);
if (fd != INVALID_FILE)
dr_close_file(fd);
return match;
}
static int
compare_symbols(const void *a_in, const void *b_in)
{
const nlist_t *a = *(const nlist_t **)a_in;
const nlist_t *b = *(const nlist_t **)b_in;
if (a->n_value > b->n_value)
return 1;
if (a->n_value < b->n_value)
return -1;
return 0;
}
/* Creates a sorted array of IMAGE_SYMBOL* entries that we can use for address lookup
* and for simpler iteration with no gaps from aux entries
*/
static void
drsym_macho_sort_symbols(macho_info_t *mod)
{
nlist_t *sym = mod->syms;
uint i;
/* There seem to be duplicate entries for some symbols. Since the
* Mach-O type values don't all seem to be documented, I don't trust
* blindly ruling out a certain type under the assumption it will
* have another entry with the same name. Thus I use a hashtable to
* de-dup.
*/
hashtable_t dup_table;
uint hash_bits = 1;
for (i = mod->num_syms; i > 1; i >>= 1)
hash_bits++;
hashtable_init_ex(&dup_table, hash_bits, HASH_STRING, false /*!strdup*/,
false /*!synch*/, NULL, NULL, NULL);
/* We throw out all symbols with zero value, but we don't bother
* to do two passes or a realloc to save memory.
*/
mod->sorted_syms =
(nlist_t **)dr_global_alloc(mod->num_syms * sizeof(*mod->sorted_syms));
mod->sorted_count = 0;
for (i = 0; i < mod->num_syms; i++, sym++) {
/* Rule out value==0 and empty names */
if (sym->n_value == 0)
continue;
if (sym->n_un.n_strx == 0 || sym->n_un.n_strx >= mod->strsz)
continue;
if ((sym->n_type & N_TYPE) == N_UNDF)
continue;
if ((sym->n_type & N_TYPE) == N_INDR) {
/* TODO i#4081: The value for an indirect symbol is the string table
* index for the name of the other symbol it points to. We should
* add an entry for that other name with the value of the target.
* Until we put in a scheme to do that, we skip them.
*/
continue;
}
const char *name = (const char *)(mod->strtab + sym->n_un.n_strx);
/* There are symbols with empty strings inside strtab.
* We could probably rule out by checking the type or sthg
* but this will do as well.
*/
if (name[0] != '\0' && hashtable_lookup(&dup_table, (void *)name) == NULL) {
hashtable_add(&dup_table, (void *)name, (void *)name);
mod->sorted_syms[mod->sorted_count++] = sym;
}
}
hashtable_delete(&dup_table);
/* XXX: using libc qsort. libelftoolchain is already using libc, and
* qsort is in-place and self-contained, so it should be fine.
*/
qsort(mod->sorted_syms, mod->sorted_count, sizeof(nlist_t *), compare_symbols);
if (verbose >= 3) {
NOTIFY(3, "%s:\n", __FUNCTION__);
for (i = 0; i < mod->sorted_count; i++) {
sym = mod->sorted_syms[i];
NOTIFY(3, " #%d: %-20s val=0x%x\n", i, drsym_obj_symbol_name(mod, i),
sym->n_value);
}
}
}
/******************************************************************************
* Mach-O interface to drsyms_unix.c
*/
void
drsym_obj_init(void)
{
/* nothing */
}
void *
drsym_obj_mod_init_pre(byte *map_base, size_t map_size)
{
macho_info_t *mod;
mach_header_t *hdr;
struct load_command *cmd, *cmd_stop;
byte *arch_base;
mod = dr_global_alloc(sizeof(*mod));
memset(mod, 0, sizeof(*mod));
arch_base = find_macho_header(map_base);
if (arch_base == NULL)
return NULL;
mod->map_base = arch_base;
hdr = (mach_header_t *)arch_base;
cmd = (struct load_command *)(hdr + 1);
cmd_stop = (struct load_command *)((char *)cmd + hdr->sizeofcmds);
while (cmd < cmd_stop) {
if (cmd->cmd == LC_SEGMENT || cmd->cmd == LC_SEGMENT_64) {
segment_command_t *seg = (segment_command_t *)cmd;
section_t *sec, *sec_stop;
sec_stop = (section_t *)((char *)seg + seg->cmdsize);
sec = (section_t *)(seg + 1);
while (sec < sec_stop) {
/* sectname is not null-terminated if it's the full 16-char size */
if (strncmp(sec->sectname, "__debug_line", sizeof(sec->sectname)) == 0)
mod->debug_kind |= DRSYM_LINE_NUMS | DRSYM_DWARF_LINE;
sec++;
}
if (strcmp(seg->segname, "__PAGEZERO") == 0 && seg->initprot == 0) {
/* i#1365: any PIE shift is placed after this, and DR skips it
* for the effective base, so we need to adjust our offsets.
*/
mod->offs_adjust = seg->vmsize;
}
} else if (cmd->cmd == LC_SYMTAB) {
/* even if stripped, dynamic symbols are in this table */
struct symtab_command *symtab = (struct symtab_command *)cmd;
mod->debug_kind |= DRSYM_SYMBOLS | DRSYM_MACHO_SYMTAB;
mod->syms = (nlist_t *)(arch_base + symtab->symoff);
mod->num_syms = symtab->nsyms;
mod->strtab = arch_base + symtab->stroff;
mod->strsz = symtab->strsize;
} else if (cmd->cmd == LC_UUID) {
memcpy(mod->uuid, ((struct uuid_command *)cmd)->uuid, sizeof(mod->uuid));
}
cmd = (struct load_command *)((char *)cmd + cmd->cmdsize);
}
return (void *)mod;
}
bool
drsym_obj_mod_init_post(void *mod_in, byte *map_base, void *dwarf_info)
{
macho_info_t *mod = (macho_info_t *)mod_in;
/* we ignore map_base, esp for fat binaries */
mod->load_base = find_load_base(mod->map_base, &mod->load_size);
drsym_macho_sort_symbols(mod);
if (dwarf_info != NULL)
drsym_dwarf_set_obj_offs(dwarf_info, mod->offs_adjust);
return true;
}
bool
drsym_obj_dwarf_init(void *mod_in, dwarf_lib_handle_t *dbg)
{
macho_info_t *mod = (macho_info_t *)mod_in;
Dwarf_Error de; /* expensive to init (DrM#1770) */
if (mod == NULL)
return false;
if (dwarf_macho_init(mod->map_base, DW_DLC_READ, NULL, NULL, dbg, &de) != DW_DLV_OK) {
NOTIFY_DWARF(de);
return false;
}
return true;
}
void
drsym_obj_mod_exit(void *mod_in)
{
macho_info_t *mod = (macho_info_t *)mod_in;
if (mod == NULL)
return;
if (mod->sorted_syms != NULL)
dr_global_free(mod->sorted_syms, mod->num_syms * sizeof(*mod->sorted_syms));
dr_global_free(mod, sizeof(*mod));
}
drsym_debug_kind_t
drsym_obj_info_avail(void *mod_in)
{
macho_info_t *mod = (macho_info_t *)mod_in;
return mod->debug_kind;
}
byte *
drsym_obj_load_base(void *mod_in)
{
macho_info_t *mod = (macho_info_t *)mod_in;
return (byte *)mod->load_base;
}
const char *
drsym_obj_debuglink_section(void *mod_in, const char *modpath)
{
/* The Mac equivalent of the GNU debuglink is the dSYM directory.
* The dsymutil tool, when given binary foo, creates
* foo.dSYM/Contents/Resources/DWARF/foo. However, there is no
* section in the original foo that names its corresponding dSYM.
* There is an LC_UUID that can be used to match them up.
*/
macho_info_t *mod = (macho_info_t *)mod_in;
const char *basename = NULL, *s;
for (s = modpath; *s != '\0'; s++) {
if (*s == '/')
basename = s;
}
if (basename == NULL)
return NULL;
else
basename++; /* skip slash */
/* 1. Check foo.dSYM/Contents/Resources/DWARF/foo */
dr_snprintf(mod->dsym_path, MAXIMUM_PATH, "%s.dSYM/Contents/Resources/DWARF/%s",
modpath, basename);
mod->dsym_path[MAXIMUM_PATH - 1] = '\0';
NOTIFY(2, "%s: looking for %s\n", __FUNCTION__, mod->dsym_path);
if (dr_file_exists(mod->dsym_path) && drsym_macho_uuids_match(mod, mod->dsym_path)) {
/* There's no reason to cache this as this routine is only called
* once, but if it ends up called multiple times we should cache.
*/
return mod->dsym_path;
}
/* XXX: search other standard places */
return NULL;
}
uint
drsym_obj_num_symbols(void *mod_in)
{
macho_info_t *mod = (macho_info_t *)mod_in;
if (mod == NULL)
return 0;
return mod->sorted_count;
}
const char *
drsym_obj_symbol_name(void *mod_in, uint idx)
{
macho_info_t *mod = (macho_info_t *)mod_in;
nlist_t *sym;
if (mod == NULL || idx >= mod->sorted_count || mod->sorted_syms == NULL)
return NULL;
sym = mod->sorted_syms[idx];
if (sym->n_un.n_strx == 0)
return "";
else if (sym->n_un.n_strx >= mod->strsz) /* error: bad index */
return "";
else {
/* we trust it won't run off end of strtab */
const char *name = (const char *)(mod->strtab + sym->n_un.n_strx);
#ifdef MACOS
if (name[0] == '_') {
/* Mach-O symbol tables seem to always have an extra underscore. */
return name + 1;
}
#endif
return name;
}
}
drsym_error_t
drsym_obj_symbol_offs(void *mod_in, uint idx, size_t *offs_start DR_PARAM_OUT,
size_t *offs_end DR_PARAM_OUT)
{
macho_info_t *mod = (macho_info_t *)mod_in;
nlist_t *sym;
if (offs_start == NULL || mod == NULL || idx >= mod->sorted_count ||
mod->sorted_syms == NULL)
return DRSYM_ERROR_INVALID_PARAMETER;
/* We removed all the symbols with value==0 when we sorted */
sym = mod->sorted_syms[idx];
*offs_start = sym->n_value - mod->load_base - mod->offs_adjust;
if (offs_end != NULL) {
/* XXX: the Mach-O nlist struct doesn't store the size
* so, like PECOFF, we use the next symbol's start as the size
* and we document that it isn't precise.
*/
if (idx + 1 < mod->sorted_count) {
*offs_end =
mod->sorted_syms[idx + 1]->n_value - mod->load_base - mod->offs_adjust;
} else
*offs_end = *offs_start + 1;
}
return DRSYM_SUCCESS;
}
drsym_error_t
drsym_obj_addrsearch_symtab(void *mod_in, size_t modoffs, uint *idx DR_PARAM_OUT)
{
macho_info_t *mod = (macho_info_t *)mod_in;
uint min = 0;
uint max = mod->sorted_count - 1;
int min_lower = -1;
drsym_error_t res;
if (mod == NULL || mod->sorted_syms == NULL || idx == NULL)
return DRSYM_ERROR_INVALID_PARAMETER;
/* since we have no symbol sizes, we have to rule this out up front */
if (modoffs >= mod->load_size)
return DRSYM_ERROR_SYMBOL_NOT_FOUND;
/* binary search */
/* XXX: share code with drsyms_pecoff.c */
NOTIFY(1, "%s: 0x%x\n", __FUNCTION__, modoffs);
while (max >= min) {
uint i = (min + max) / 2;
size_t symoffs;
/* we ignore unknown sec here and treat all such as 0 at front of array */
res = drsym_obj_symbol_offs(mod, i, &symoffs, NULL);
NOTIFY(2, "\tbinary search %d => 0x%x == %s\n", i, symoffs,
drsym_obj_symbol_name(mod_in, i));
if (res != DRSYM_SUCCESS && res != DRSYM_ERROR_SYMBOL_NOT_FOUND)
return res;
if (modoffs < symoffs) {
max = i - 1;
} else if (modoffs >= symoffs) {
if (max == min || modoffs == symoffs) {
/* found closest sym with offs <= target */
min_lower = i;
break;
} else {
min_lower = i;
min = i + 1;
}
}
}
NOTIFY(2, "\tbinary search => %d\n", min_lower);
if (min_lower > -1) {
/* found closest sym with offs <= target */
*idx = min_lower;
return DRSYM_SUCCESS;
}
return DRSYM_ERROR_SYMBOL_NOT_FOUND;
}
/******************************************************************************
* Unix-specific helpers
*/
/* Returns true if the two paths have the same inode. Returns false if there
* was an error or they are different.
*
* XXX: share this with drsyms_elf.c
*/
bool
drsym_obj_same_file(const char *path1, const char *path2)
{
struct stat stat1;
struct stat stat2;
int r;
r = stat(path1, &stat1);
if (r != 0)
return false;
r = stat(path2, &stat2);
if (r != 0)
return false;
return stat1.st_ino == stat2.st_ino;
}
const char *
drsym_obj_debug_path(void)
{
return "/usr/lib/debug";
}
const char *
drsym_obj_build_id(void *mod_in)
{
/* NYI. Are build id-based dirs used on Mac? */
return NULL;
}