blob: 21ae5d3d6f94913a09e501bee4be7360d9cec622 [file] [log] [blame]
/* **********************************************************
* Copyright (c) 2010-2017 Google, Inc. All rights reserved.
* Copyright (c) 2009-2010 VMware, Inc. All rights reserved.
* **********************************************************/
/* Dr. Memory: the memory debugger
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation;
* version 2.1 of the License, and no later version.
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Library General Public License for more details.
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
/*
* Dr. Heapstat heap profiler
*/
#include "dr_api.h"
#include "drwrap.h"
#include "drsyscall.h"
#include "drsymcache.h"
#include "umbra.h"
#include "drheapstat.h"
#include "alloc.h"
#include "heap.h"
#include "callstack.h"
#include "crypto.h"
#include "staleness.h"
#include "../drmemory/leak.h"
#include "../drmemory/stack.h"
#include "../drmemory/shadow.h"
#include "../drmemory/instru.h"
#include "../drmemory/slowpath.h"
#ifdef MACOS
# error NYI i#1438
#elif defined(LINUX)
# include "sysnum_linux.h"
# include <errno.h>
# define _GNU_SOURCE /* for sched.h */
# include <linux/sched.h> /* for CLONE_VM */
# include <sys/time.h>
# include <signal.h> /* for SIGSEGV */
#include <stddef.h> /* for offsetof */
/* IF_MD5_ELSE(x, y) selects x when callstacks are keyed by md5 (USE_MD5 is
 * defined) and y otherwise.  The expansion is intentionally not wrapped in
 * parentheses: the macro is also used to pick a struct member name, as in
 * per->IF_MD5_ELSE(md5, crc).
 */
#ifdef USE_MD5
# define IF_MD5_ELSE(x, y) x
#else
# define IF_MD5_ELSE(x, y) y
#endif
/* for sharing data among instrumentation passes */
/* Per-basic-block record created in event_bb_app2app() and handed to the
 * later instrumentation phases through the user_data pointer.
 */
typedef struct _instru_info_t {
/* shared bb instrumentation state ("we pass bi among all 4 phases") */
bb_info_t bi;
/* NOTE(review): presumably the where_dead argument of insert_instr_counter(),
 * i.e. an insertion point to use instead of the first instr -- confirm
 */
instr_t *where_dead;
/* NOTE(review): presumably the flags_dead argument of insert_instr_counter(),
 * which skips the arith flags save/restore when true -- confirm
 */
bool flags_dead;
/* NOTE(review): presumably the per-bb amount subtracted from instr_count by
 * insert_instr_counter() -- confirm
 */
uint instrs_in_bb;
} instru_info_t;
char logsubdir[MAXIMUM_PATH];
file_t f_callstack = INVALID_FILE;
file_t f_snapshot = INVALID_FILE;
file_t f_staleness = INVALID_FILE;
file_t f_nudge = INVALID_FILE; /* PR 502468 - nudge visualization */
static uint num_threads;
/* Counters for time unit intervals */
static int instr_count;
static int byte_count;
static int allocfree_count;
/* For -time_clock, the frequency is -dump_freq*10 milliseconds */
#define TIME_BASE_FREQ 10
/* These are all in milliseconds */
static uint timer_clock;
static uint timer_stale;
static uint timer_real;
/* Needed to compute the current partial snapshot (PR 548013) */
uint64 timestamp_last_snapshot;
static volatile bool sideline_exit;
/* FIXME i#297: DR synchs + terminates our sideline thread prior to
 * calling our exit event so we have no chance to clean up its memory.
 * DR then asserts about leaks. Using a targeted solution for now.
 */
static tls_util_t *sideline_pt;
static int leak_count;
static int reachable_leak_count;
/* To avoid the code expansion from a clean call in every bb we use
* a shared clean call sequence, entering it with a direct jump
* and exiting with an indirect jump to a stored return point.
static byte *shared_instrcnt_callout;
static byte *shared_code_region;
(PAGE_SIZE + (options.staleness ? (SHARED_SLOWPATH_SIZE) : 0))
/* We serialize snapshots since rare so not costly perf-wise, and this avoids
 * needing potentially very large buffers to try and get atomic writes
 */
static void *snapshot_lock;
/* We intercept libc/ntdll allocation routines instead of providing our
 * own, for maximum transparency.  Both use 8-byte (for 32-bit) headers.
 * FIXME: for 64-bit Windows, 8-byte-or-smaller allocs have special headers
 * that are only 8 bytes instead of 16?
 *
 * HEADER_SIZE is the per-allocation header overhead charged as
 * "extra_occupied" bytes.  The expansion is fully parenthesized so that it
 * is safe inside larger expressions (e.g. x / HEADER_SIZE).
 */
#ifdef UNIX
/* FIXME: mmap chunks have 2*size headers (xref PR 474912) */
# define HEADER_SIZE (sizeof(size_t))
#else
# define HEADER_SIZE (2*sizeof(size_t))
#endif
static void reset_clock_timer(void);
static void reset_real_timer(void);
static void event_thread_exit(void *drcontext);
static void
# define STATS_DUMP_FREQ 10000
uint alloc_stack_count;
static uint peaks_detected;
static uint peaks_skipped;
/* PR 465174: share allocation site callstacks.
 * This table should only be accessed while holding the lock for
 * malloc_table (via malloc_lock(), which we enable via
 * alloc_ops.global_lock), which makes the coordinated operations with
 * malloc_table atomic.
 */
static hashtable_t alloc_stack_table;
/* Used to check collisions with crc32 */
static hashtable_t alloc_md5_table;
* TLS and CLS
/* Per-thread data kept in the drmgr TLS field indexed by tls_idx_heapstat */
typedef struct _tls_heapstat_t {
char *errbuf; /* buffer for atomic writes */
/* size reported alongside errbuf by get_buffer() */
size_t errbufsz;
# ifdef UNIX
int64 filepos; /* f_callstack file position */
# endif
} tls_heapstat_t;
/* XXX: share w/ syscall_os.h */
#ifdef WINDOWS
# define SYSCALL_NUM_ARG_STORE 6 /* 6 is max on Linux */
static int tls_idx_heapstat = -1;
static void
if (options.dump) {
options.snapshots = 1;
} else {
options.dump_freq = 1;
/* Instr threshold is per 1k, so a max of 2 trillion instrs.
* We could support more and still do 32-bit inlined arith by
* having the callout do a mod.
if (options.time_instrs) {
if (options.dump_freq > UINT_MAX/1000)
usage_error("-dump_freq: value too large", "");
options.dump_freq *= 1000;
/* We count backward */
instr_count = options.dump_freq;
} else if (options.time_bytes) {
if (!options.dump) {
/* only *8 if doing const # snapshots. this is invisible
* to user so no need to talk about in docs: just makes
* initial snapshots more reasonable than starting at 1.
ASSERT(options.dump_freq == 1, "invalid assumption");
options.dump_freq = 8;
/* We count backward */
byte_count = options.dump_freq;
} else if (options.time_allocs) {
/* We count backward */
allocfree_count = options.dump_freq;
static void
drheap_options_init(const char *opstr)
/* set globals */
op_print_stderr = options.use_stderr;
op_verbose_level = options.verbose;
op_pause_at_assert = options.pause_at_assert;
op_pause_via_loop = options.pause_via_loop;
op_ignore_asserts = options.ignore_asserts;
op_use_symcache = options.use_symcache;
/* Snapshot arrays: of size options.snapshots.  For !options.dump,
 * when the arrays fill, we double the options.dump_freq frequency and
 * then replace snapshots that would not have existed if the
 * frequency had been the double one from the beginning.
 */
static uint snap_idx;
static uint snap_fills;
/* We keep a linked list of these structs per snapshot.
 * One struct per callstack that has non-zero usage in that snapshot.
 * This can save a lot of memory versus arrays when there are
 * many callstacks and few are present in all snapshots.
 * Xref PR 493134.
 */
/* One list node: the heap usage attributed to a single allocation-site
 * callstack within one snapshot.
 */
typedef struct _heap_used_t {
/* FIXME: 64-bit? */
/* allocation count for this callstack; account_for_bytes_pre() unlinks and
 * frees the node once this drops to 0
 */
uint instances;
/* running sum of the asked_for amounts passed to account_for_bytes_pre() */
uint bytes_asked_for;
ushort extra_usable; /* beyond bytes_asked_for */
ushort extra_occupied; /* beyond bytes_asked_for + extra_usable */
/* next node in the same snapshot's list, or NULL at the tail */
struct _heap_used_t *next;
/* the callstack this usage belongs to */
per_callstack_t *callstack;
} heap_used_t;
/* Arrays of snapshots. Not using a heap_used_t b/c we need larger counters. */
/* Totals plus per-callstack detail for one snapshot of heap usage */
typedef struct _per_snapshot_t {
/* time of the snapshot, printed (after adding stamp_offs) in the unit
 * named by unit_name()
 */
uint64 stamp;
/* total allocation count */
uint64 tot_mallocs;
/* sum of asked_for over all accounted allocations */
uint64 tot_bytes_asked_for;
/* sum of asked_for + extra_usable */
uint64 tot_bytes_usable;
/* sum of asked_for + extra_usable + extra_occupied; this is the value
 * compared against the peak snapshot
 */
uint64 tot_bytes_occupied;
/* Linked list of non-zero usage per callstack */
heap_used_t *used;
/* Staleness data: array with one entry per live malloc */
stale_snap_allocs_t *stale;
} per_snapshot_t;
static uint64 stamp;
/* Used for starting time 0 in middle of run (post-nudge usually) */
static uint64 stamp_offs;
static per_snapshot_t *snaps;
/* peak snapshot (PR 476018) */
static per_snapshot_t snap_peak;
/* track changes in # allocs+frees for PR 566116 */
static uint64 allocfree_cur, allocfree_last_peak;
#define SNAPSHOT_LOG_BUF_SIZE (32*1024)
static char snaps_log_buf[SNAPSHOT_LOG_BUF_SIZE]; /* PR 551841 */
struct _per_callstack_t {
uint id;
#if defined(USE_MD5) || defined(CHECK_WITH_MD5)
/* PR 496304: we keep just the md5 to save memory. We either do so
* instead of crc32, or in addition as a debug check on collisions.
byte md5[MD5_RAW_BYTES];
#ifndef USE_MD5
/* PR 496304: we keep just a checksum to save memory. Since crc32 could
* collide we use crc32 of the whole callstack plus a separate crc32
* of the first half of the callstack.
uint crc[2];
/* for the current snapshot */
heap_used_t *used;
/* for node removal w/o keeping a prev per heap_used_t per snapshot */
heap_used_t *prev_used;
static uint num_callstacks;
static uint snapshot_count;
static uint nudge_count;
get_cstack_id(per_callstack_t *per)
return per->id;
static inline per_callstack_t *
get_cstack_from_alloc_data(void *client_data)
/* We store either per_callstack_t or stale_per_alloc_t in the
* client_data slot in each malloc
if (options.staleness)
return ((stale_per_alloc_t *)client_data)->cstack;
return (per_callstack_t *) client_data;
alloc_callstack_free(void *p)
per_callstack_t *per = (per_callstack_t *) p;
global_free(per, sizeof(*per), HEAPSTAT_CALLSTACK);
client_malloc_data_free(void *data)
/* nothing to do since we persist our callstacks in alloc_stack_table */
/* Returns the client data to keep with a malloc entry that is moving to the
 * free list.  The data is handed back unchanged: callstacks persist in
 * alloc_stack_table, so there is nothing to convert or release here.
 */
void *
client_malloc_data_to_free_list(void *cur_data, dr_mcontext_t *mc, app_pc post_call)
{
    return cur_data;
}
/* Returns the client data for a free chunk that is being split.
 * The same pointer is reused; this cannot become a use-after-free because
 * client_malloc_data_free() never frees it.
 */
void *
client_malloc_data_free_split(void *cur_data)
{
    return cur_data;
}
static void
get_buffer(void *drcontext, char **buf/*OUT*/, size_t *bufsz/*OUT*/)
tls_heapstat_t *pt = (tls_heapstat_t *)
drmgr_get_tls_field(drcontext, tls_idx_heapstat);
ASSERT(buf != NULL && bufsz != NULL, "invalid args");
if (pt == NULL) {
/* at init time no pt yet */
*bufsz = MAX_ERROR_INITIAL_LINES + max_callstack_size();
*buf = (char *) global_alloc(*bufsz, HEAPSTAT_CALLSTACK);
} else {
*buf = pt->errbuf;
*bufsz = pt->errbufsz;
static void
release_buffer(void *drcontext, char *buf, size_t bufsz)
tls_heapstat_t *pt = (tls_heapstat_t *)
drmgr_get_tls_field(drcontext, tls_idx_heapstat);
if (pt == NULL) {
global_free(buf, bufsz, HEAPSTAT_CALLSTACK);
static const char *
if (options.time_instrs)
return "instrs";
if (options.time_allocs)
return "mallocs";
if (options.time_bytes)
return "bytes";
if (options.time_clock)
return "ticks (10ms each)";
return "<error>";
/* Up to caller to synchronize */
static void
dump_snapshot(per_snapshot_t *snap, int idx/*-1 means peak*/)
heap_used_t *u;
size_t sofar = 0;
ssize_t len = 0;
LOG(2, "dumping snapshot idx=%d count=%"INT64_FORMAT"u\n",
idx, snap->stamp);
dr_fprintf(f_snapshot, "SNAPSHOT #%4d @ %16"INT64_FORMAT"u %s\n",
snapshot_count, snap->stamp + stamp_offs, unit_name());
dr_fprintf(f_snapshot, "idx=%d, stamp_offs=%16"INT64_FORMAT"u\n",
idx, stamp_offs);
dr_fprintf(f_snapshot, "total: %"INT64_FORMAT"u,%"INT64_FORMAT"u,%"
snap->tot_mallocs, snap->tot_bytes_asked_for,
snap->tot_bytes_usable, snap->tot_bytes_occupied);
for (u = snap->used; u != NULL; u = u->next) {
if (u->bytes_asked_for + u->extra_usable > 0) {
/* PR 551841: buffer snapshot output else performance is bad. */
BUFFERED_WRITE(f_snapshot, snaps_log_buf, SNAPSHOT_LOG_BUF_SIZE,
sofar, len, "%u,%u,%u,%u,%u\n",
u->callstack->id, u->instances, u->bytes_asked_for,
u->extra_usable, u->extra_occupied);
FLUSH_BUFFER(f_snapshot, snaps_log_buf, sofar);
if (options.staleness) {
uint i;
dr_fprintf(f_staleness, "SNAPSHOT #%4d @ %16"INT64_FORMAT"u %s\n",
snapshot_count, snap->stamp + stamp_offs, unit_name());
/* FIXME: optimize by listing cstack id only once; or binary format.
* If still too big then collapse similar timestamps and give up
* some runtime flexibility in granularity
sofar = 0;
for (i = 0; snap->stale != NULL && i < snap->stale->num_entries; i++) {
/* PR 551841: improve perf by printing to buffer to reduce # file writes */
BUFFERED_WRITE(f_staleness, snaps_log_buf, SNAPSHOT_LOG_BUF_SIZE,
sofar, len, "%u,%u,%"INT64_FORMAT"u\n",
staleness_get_snap_cstack_id(snap->stale, i),
staleness_get_snap_bytes(snap->stale, i),
staleness_get_snap_last_access(snap->stale, i));
FLUSH_BUFFER(f_staleness, snaps_log_buf, sofar);
/* Releases everything a snapshot owns: its list of per-callstack usage
 * nodes and, under -staleness, its staleness data.  Both fields are left
 * NULL afterwards.
 * Caller must hold snapshot_lock.
 */
static void
free_snapshot(per_snapshot_t *snap)
{
    heap_used_t *u, *nxt_u;
    for (u = snap->used; u != NULL; u = nxt_u) {
        /* read the link before the node goes away */
        nxt_u = u->next;
        global_free(u, sizeof(*u), HEAPSTAT_SNAPSHOT);
    }
    snap->used = NULL;
    if (options.staleness && snap->stale != NULL) {
        /* Free the data (allocated by staleness_take_snapshot()) before
         * dropping the only pointer to it; clearing alone would leak it.
         */
        staleness_free_snapshot(snap->stale);
        snap->stale = NULL;
    }
}
/* Caller must hold snapshot_lock.
* Calls free_snapshot on dst first.
* If new_live is true, dst is the new "live" in-progress snapshot
* whose list is where callstack table entries point.
static void
copy_snapshot(per_snapshot_t *dst, per_snapshot_t *src, bool new_live)
heap_used_t *u, *nxt_u, *prev_u;
int i;
/* Replace the existing list at dst with a clone of
* src, and update the callstack table pointers. First
* we clear the callstack table pointers in case pointing to
* entries not in the new list.
ASSERT(src != dst, "cannot copy to self");
memcpy(dst, src, sizeof(*dst));
if (options.staleness) {
/* We fill this in at snapshot time */
dst->stale = NULL;
if (new_live) {
for (i = 0; i < HASHTABLE_SIZE(alloc_stack_table.table_bits); i++) {
hash_entry_t *he;
for (he = alloc_stack_table.table[i]; he != NULL; he = he->next) {
per_callstack_t *per = (per_callstack_t *) he->payload;
per->used = NULL;
per->prev_used = NULL;
prev_u = NULL;
for (u = src->used; u != NULL; u = u->next) {
nxt_u = (heap_used_t *) global_alloc(sizeof(*nxt_u), HEAPSTAT_SNAPSHOT);
memcpy(nxt_u, u, sizeof(*nxt_u));
if (prev_u == NULL)
dst->used = nxt_u;
prev_u->next = nxt_u;
if (new_live) {
nxt_u->callstack->used = nxt_u;
nxt_u->callstack->prev_used = prev_u;
nxt_u->next = NULL;
prev_u = nxt_u;
/* Returns true when new_val and old_val differ by more than the given
 * percentage of old_val.  Integer-only: the comparison
 * |new - old| / old > percent / 100 is cross-multiplied, on the assumption
 * that uint64 arithmetic does not overflow.
 */
static bool
difference_exceeds_percent(uint64 new_val, uint64 old_val, uint percent)
{
    uint64 delta;
    if (new_val > old_val)
        delta = new_val - old_val;
    else
        delta = old_val - new_val;
    return (100 * delta > percent * old_val);
}
/* If the current snap_idx snapshot is larger than the current peak,
* makes a new peak snapshot (PR 476018).
* Assumes snapshot lock and malloc_lock are held.
static void
if (snaps[snap_idx].tot_bytes_occupied > snap_peak.tot_bytes_occupied) {
/* PR 566116: avoid too-frequent peak snapshots by ignoring if the new
* peak is similar to the existing one, both in size and in malloc
* makeup. May need to split -peak_threshold into 3 if we need
* separate control of each variable.
if (difference_exceeds_percent(snaps[snap_idx].tot_bytes_occupied,
options.peak_threshold) ||
difference_exceeds_percent(allocfree_cur, allocfree_last_peak,
snap_peak.tot_bytes_occupied) ||
/* even if not much different, if it's been a long time, use it */
options.peak_threshold)) {
allocfree_last_peak = allocfree_cur;
copy_snapshot(&snap_peak, &snaps[snap_idx], false/*isolated copy*/);
if (options.staleness) {
/* copy_snapshot called free_snapshot which freed this */
ASSERT(snap_peak.stale == NULL, "invalid staleness data");
snap_peak.stale = staleness_take_snapshot(stamp);
LOG(2, "new peak snapshot, tot occupied=%"INT64_FORMAT"u\n",
} else {
LOG(2, "NOT taking new peak snapshot, tot occupied=%"INT64_FORMAT"u\n",
/* Caller must hold malloc_lock() */
static void
uint prev_idx;
/* We serialize snapshots since rare so not costly perf-wise, and this avoids
* needing potentially very large buffers to try and get atomic writes
if (options.staleness) {
/* Unlike the mem usage data which is maintained as the app
* executes, we have to go collect this at snapshot time from
* the malloc table.
if (options.dump && snaps[snap_idx].stale != NULL) {
snaps[snap_idx].stale = NULL;
ASSERT(snaps[snap_idx].stale == NULL, "invalid staleness data");
snaps[snap_idx].stale = staleness_take_snapshot(stamp);
if (options.dump) {
snaps[snap_idx].stamp += options.dump_freq;
dump_snapshot(&snaps[snap_idx], snap_idx);
} else {
stamp += options.dump_freq;
snaps[snap_idx].stamp = stamp;
prev_idx = snap_idx;
LOG(2, "take_snapshot @idx=%u stamp=%"INT64_FORMAT"u\n", prev_idx, stamp);
/* Check for peak on every snapshot (PR 476018) */
/* Find the next one we should overwrite. Keep those aligned
* w/ current dump_freq.
do {
if (snap_idx >= options.snapshots) {
snap_idx = 0;
options.dump_freq *= 2;
LOG(1, "adjusting snapshots: new freq=%u\n", options.dump_freq);
if (options.time_clock)
} while (snaps[snap_idx].stamp > 0 &&
(snaps[snap_idx].stamp % options.dump_freq) == 0);
/* Replace the existing list at snap_idx with a clone of prev_idx */
copy_snapshot(&snaps[snap_idx], &snaps[prev_idx], true/*live copy*/);
/* Called from pre-alloc-hashtable-change events.
* Updates the current snapshot and callstack usage.
static void
account_for_bytes_pre(per_callstack_t *per, int asked_for,
int extra_usable, int extra_occupied, bool realloc)
/* must be synched w/ take_snapshot(). the malloc lock is always acquired
* before the snapshot lock.
if (asked_for+extra_usable > 0) {
if (per->used == NULL) {
per->used = (heap_used_t *)
global_alloc(sizeof(*per->used), HEAPSTAT_SNAPSHOT);
memset(per->used, 0, sizeof(*per->used));
per->used->callstack = per;
per->used->next = snaps[snap_idx].used;
if (snaps[snap_idx].used != NULL) {
ASSERT(snaps[snap_idx].used->callstack->prev_used == NULL,
"prev_used should already be null");
snaps[snap_idx].used->callstack->prev_used = per->used;
ASSERT(per->prev_used == NULL, "prev_used should already be null");
snaps[snap_idx].used = per->used;
if (!realloc)
} else {
ASSERT(asked_for+extra_usable < 0, "cannot have 0-sized usable space");
ASSERT(per->used != NULL, "alloc must exist");
ASSERT(per->used->instances >= 0, "alloc count must be >= 0");
ASSERT(snaps[snap_idx].tot_mallocs >= 0, "alloc count must be >= 0");
if (!realloc)
per->used->bytes_asked_for += asked_for;
per->used->extra_usable += extra_usable;
per->used->extra_occupied += extra_occupied;
LOG(2, "callstack id %u => %ux, %uB, +%uB, +%uB\n", per->id,
per->used->instances, per->used->bytes_asked_for,
per->used->extra_usable, per->used->extra_occupied);
if (per->used->instances == 0) {
/* remove the node to save memory since may not re-alloc */
ASSERT(per->used->bytes_asked_for == 0, "no malloc => no bytes!");
if (per->used->next != NULL)
per->used->next->callstack->prev_used = per->prev_used;
if (per->prev_used == NULL) {
ASSERT(per->used == snaps[snap_idx].used, "prev node error");
snaps[snap_idx].used = per->used->next;
} else {
per->prev_used->next = per->used->next;
global_free(per->used, sizeof(*per->used), HEAPSTAT_SNAPSHOT);
per->used = NULL;
per->prev_used = NULL;
snaps[snap_idx].tot_bytes_asked_for += asked_for;
snaps[snap_idx].tot_bytes_usable += asked_for + extra_usable;
snaps[snap_idx].tot_bytes_occupied += asked_for + extra_usable + extra_occupied;
/* Called from post-alloc-hashtable-change events which is important to
* have a consistent view in the staleness hashtable walk (PR 567117).
* Updates the -time_allocs and -time_bytes counters and potentially
* takes snapshots.
static void
account_for_bytes_post(int asked_for, int extra_usable, int extra_occupied)
if (options.time_bytes) {
/* PR 545288: consider dealloc as well as alloc */
int diff = asked_for + extra_usable + extra_occupied;
if (diff < 0)
diff = -diff;
if (diff > byte_count) {
/* allocs larger than cur freq occupy multiple snapshots */
while (diff > byte_count) {
diff -= options.dump_freq;
byte_count = options.dump_freq;
} else {
byte_count -= diff;
} else if (options.time_allocs) {
/* PR 545288: consider free as well as alloc */
/* We rely on malloc lock being held by caller */
if (allocfree_count <= 0) {
allocfree_count = options.dump_freq;
/* Writes one callstack record to f_callstack: a "CALLSTACK <id>" line, the
 * frames of pcs, then END_MARKER.
 * buf/bufsz: scratch buffer the record is assembled in
 * sofar:     in/out count of characters already in buf
 */
static void
dump_callstack(packed_callstack_t *pcs, per_callstack_t *per,
               char *buf, size_t bufsz, size_t *sofar)
{
    ssize_t written = 0;
    /* we use a buffer for atomic prints even though malloc lock does
     * currently synchronize
     */
    BUFPRINT(buf, bufsz, *sofar, written, "CALLSTACK %u\n", per->id);
    packed_callstack_print(pcs, 0, buf, bufsz, sofar, NULL);
    BUFPRINT(buf, bufsz, *sofar, written, "%s", END_MARKER);
    print_buffer(f_callstack, buf);
}
/* A lock is held around the call to this routine */
void *
client_add_malloc_pre(malloc_info_t *info, dr_mcontext_t *mc, app_pc post_call)
void *drcontext = dr_get_current_drcontext();
per_callstack_t *per;
char *buf;
size_t bufsz;
size_t sofar = 0;
static uint malloc_count;
get_buffer(drcontext, &buf, &bufsz);
if (info->client_data != NULL) {
per = get_cstack_from_alloc_data(info->client_data);
(void *)per->IF_MD5_ELSE(md5, crc)) == (void*)per,
"malloc re-add should still be in table");
} else {
#if defined(USE_MD5) || defined(CHECK_WITH_MD5)
byte md5[MD5_RAW_BYTES];
#ifndef USE_MD5
uint crc[2];
/* Printing to a buffer is slow (quite noticeable: 2x on cfrac) so it's
* faster to create a packed callstack for computing the checksum to
* decide uniqueness, limiting printing to new callstacks only.
packed_callstack_t *pcs;
app_loc_t loc;
pc_to_loc(&loc, post_call);
packed_callstack_record(&pcs, mc, &loc, options.callstack_max_frames);
#if defined(USE_MD5) || defined(CHECK_WITH_MD5)
packed_callstack_md5(pcs, md5);
#ifndef USE_MD5
packed_callstack_crc32(pcs, crc);
per = (per_callstack_t *)
hashtable_lookup(&alloc_stack_table, (void *)IF_MD5_ELSE(md5, crc));
/* Check for collisions with crc32 */
ASSERT(per == hashtable_lookup(&alloc_md5_table, (void *)md5),
"crc and md5 do not agree");
if (per == NULL) {
per = (per_callstack_t *) global_alloc(sizeof(*per), HEAPSTAT_CALLSTACK);
memset(per, 0, sizeof(*per));
/* we could do ++ since there's an outer lock */
per->id = atomic_add32_return_sum((volatile int *)&num_callstacks, 1);
#if defined(USE_MD5) || defined(CHECK_WITH_MD5)
memcpy(per->md5, md5, BUFFER_SIZE_BYTES(md5));
hashtable_add(IF_MD5_ELSE(&alloc_stack_table, &alloc_md5_table),
(void *)per->md5, (void *)per);
#ifndef USE_MD5
per->crc[0] = crc[0];
per->crc[1] = crc[1];
hashtable_add(&alloc_stack_table, (void *)per->crc, (void *)per);
dump_callstack(pcs, per, buf, bufsz, &sofar);
sofar = packed_callstack_free(pcs);
ASSERT(sofar == 0, "pcs should have 0 ref count");
#ifdef X64
/* FIXME: assert not truncating */
account_for_bytes_pre(per, info->request_size, info->pad_size, HEADER_SIZE, false);
if (((malloc_count++) % STATS_DUMP_FREQ) == 0)
release_buffer(drcontext, buf, bufsz);
if (options.staleness)
return (void *) staleness_create_per_alloc(per, stamp);
return (void *) per;
client_add_malloc_post(malloc_info_t *info)
/* take potential snapshots here once table is consistent (PR 567117) */
account_for_bytes_post(info->request_size, info->pad_size, HEADER_SIZE);
/* A lock is held around the call to this routine */
client_remove_malloc_pre(malloc_info_t *info)
per_callstack_t *per = get_cstack_from_alloc_data(info->client_data);
#ifdef X64
/* FIXME: assert not truncating */
/* To avoid repeatedly redoing the peak snapshot we wait until a drop (PR 476018) */
account_for_bytes_pre(per, -(ssize_t)info->request_size, -(ssize_t)info->pad_size,
-(ssize_t)(HEADER_SIZE), false);
if (options.staleness)
staleness_free_per_alloc((stale_per_alloc_t *)info->client_data);
client_remove_malloc_post(malloc_info_t *info)
/* take potential snapshots here once table is consistent (PR 567117) */
account_for_bytes_post(info->request_size, info->pad_size, HEADER_SIZE);
static void
snapshot_lock = dr_mutex_create();
snaps = (per_snapshot_t *)
global_alloc(options.snapshots*sizeof(*snaps), HEAPSTAT_SNAPSHOT);
memset(snaps, 0, options.snapshots*sizeof(*snaps));
/* Caller must hold malloc lock */
static void
uint i;
/* These should be sorted by stamp, but simpler to have the vis tool
* sort them.
/* We do dump the partially-full current snapshot (PR 548013) */
if (options.time_clock) {
uint64 diff = ((dr_get_milliseconds() - timestamp_last_snapshot)
/ TIME_BASE_FREQ) + 1 /* round up */;
snaps[snap_idx].stamp = stamp + diff;
} else if (options.time_allocs)
snaps[snap_idx].stamp = stamp + (options.dump_freq - allocfree_count);
else if (options.time_bytes)
snaps[snap_idx].stamp = stamp + (options.dump_freq - byte_count);
else if (options.time_instrs)
snaps[snap_idx].stamp = stamp + (options.dump_freq - instr_count);
/* Check for peak on every snapshot (PR 476018) */
dump_snapshot(&snap_peak, -1);
if (snap_fills == 0) {
for (i = 0; i <= snap_idx; i++) {
dump_snapshot(&snaps[i], i);
} else {
/* FIXME: sort by stamp */
for (i = 0; i < options.snapshots; i++) {
dump_snapshot(&snaps[i], i);
static void
int i;
for (i = 0; i < options.snapshots; i++)
global_free(snaps, options.snapshots*sizeof(*snaps), HEAPSTAT_SNAPSHOT);
client_handle_malloc(void *drcontext, malloc_info_t *info, dr_mcontext_t *mc)
if (options.check_leaks)
leak_handle_alloc(drcontext, info->base, info->request_size);
client_handle_realloc(void *drcontext, malloc_info_t *old_info, malloc_info_t *new_info,
bool for_reuse, dr_mcontext_t *mc)
/* We only need to act on an in-place realloc, as an out-of-place realloc
* will have already called client_{remove,add}_malloc_{pre,post}.
if (new_info->base == old_info->base) {
per_callstack_t *per = get_cstack_from_alloc_data(new_info->client_data);
ssize_t delta_req = (ssize_t)new_info->request_size - old_info->request_size;
ssize_t delta_pad = (ssize_t)new_info->pad_size - old_info->pad_size;
ssize_t delta_head = 0;
if (delta_req < 0) {
/* Just like on a free we check for a drop in the peak */
account_for_bytes_pre(per, delta_req, delta_pad, delta_head, true/*realloc*/);
account_for_bytes_post(delta_req, delta_pad, 0);
if (options.check_leaks)
leak_handle_alloc(drcontext, new_info->base, new_info->request_size);
client_handle_alloc_failure(size_t request_size, app_pc pc, dr_mcontext_t *mc)
client_handle_realloc_null(app_pc pc, dr_mcontext_t *mc)
/* Returns the value to pass to free(). Return "real_base" for no change.
* The Windows heap param is INOUT so it can be changed as well.
client_handle_free(malloc_info_t *info, byte *tofree, dr_mcontext_t *mc,
app_pc free_routine, void *routine_set_data, bool for_reuse
_IF_WINDOWS(ptr_int_t *auxarg INOUT))
return tofree;
client_handle_free_reuse(void *drcontext, malloc_info_t *info, dr_mcontext_t *mc)
/* nothing */
client_new_redzone(app_pc start, size_t size)
/* nothing */
client_invalid_heap_arg(app_pc pc, app_pc target, dr_mcontext_t *mc, const char *routine,
bool is_free)
/* not reporting today */
LOG(1, "invalid heap arg\n");
client_mismatched_heap(app_pc pc, app_pc target, dr_mcontext_t *mc,
const char *alloc_routine, const char *free_routine,
const char *action, void *client_data, bool C_vs_CPP)
/* not reporting today */
LOG(1, "mismatched heap routines\n");
client_handle_mmap(void *drcontext, app_pc base, size_t size, bool anon)
client_handle_munmap(app_pc base, size_t size, bool anon)
client_handle_munmap_fail(app_pc base, size_t size, bool anon)
#ifdef UNIX
client_handle_mremap(app_pc old_base, size_t old_size, app_pc new_base, size_t new_size,
bool image)
/* Callback for a newly registered malloc routine at pc.  This client keeps
 * no per-routine data, so it returns NULL.
 */
void *
client_add_malloc_routine(app_pc pc)
{
    return NULL;
}
client_remove_malloc_routine(void *client_data)
#ifdef WINDOWS
client_handle_heap_destroy(void *drcontext, HANDLE heap,
void *client_data)
client_remove_malloc_on_destroy(HANDLE heap, byte *start, byte *end)
if (options.check_leaks)
leak_remove_malloc_on_destroy(heap, start, end);
#endif /* WINDOWS */
client_stack_alloc(byte *start, byte *end, bool defined)
client_stack_dealloc(byte *start, byte *end)
client_write_memory(byte *start, size_t size, dr_mcontext_t *mc)
return true;
client_read_memory(byte *start, size_t size, dr_mcontext_t *mc)
return true;
#ifdef DEBUG
client_print_callstack(void *drcontext, dr_mcontext_t *mc, app_pc pc)
print_callstack_to_file(drcontext, mc, pc, f_global, options.callstack_max_frames);
client_pre_syscall(void *drcontext, int sysnum)
client_post_syscall(void *drcontext, int sysnum)
client_found_leak(app_pc start, app_pc end, size_t indirect_bytes,
bool pre_us, bool reachable,
bool maybe_reachable, void *client_data,
bool count_reachable, bool show_reachable)
/* XXX i#926: we now have online symbolization of leak callstacks, but
* we're re-symbolizing in until we can produce a proper
* error report here including suppression checking. We should refactor
* Dr. Memory leak reporting (and checking) into a library for sharing.
per_callstack_t *per = get_cstack_from_alloc_data(client_data);
ssize_t len = 0;
size_t sofar = 0;
char *buf;
size_t bufsz;
void *drcontext = dr_get_current_drcontext();
int num;
ASSERT(options.check_leaks, "leak checking error");
if (pre_us && options.ignore_early_leaks)
if (reachable) {
if (count_reachable)
if (!show_reachable)
if (maybe_reachable && !options.possible_leaks)
num = atomic_add32_return_sum((volatile int *)&leak_count, 1);
get_buffer(drcontext, &buf, &bufsz);
BUFPRINT(buf, bufsz, sofar, len, "Error #%d: ", num);
if (reachable)
BUFPRINT(buf, bufsz, sofar, len, "REACHABLE ");
else if (maybe_reachable)
BUFPRINT(buf, bufsz, sofar, len, "POSSIBLE ");
BUFPRINT(buf, bufsz, sofar, len,
"LEAK %d direct bytes "PFX"-"PFX" + %d indirect bytes"
"\n\tcallstack=%d\n\terror end\n",
(end - start), start, end, indirect_bytes, per->id);
print_buffer(f_global, buf);
release_buffer(drcontext, buf, bufsz);
/* N.B.: mcontext is not in consistent app state, for efficiency.
static void
bool do_snapshot = false;
ASSERT(options.time_instrs, "option mismatch");
/* We racily subtract and check, so serialize now for a single snapshot */
/* We can still have a double-snapshot if threshold is low enough that
* other threads bump the instr_count over it before 2nd guy to get lock
* can do this check, but that's ok
if (instr_count < 0) {
do_snapshot = true;
instr_count = options.dump_freq;
} else {
/* We assume can't take so long that it wraps around */
ASSERT(instr_count <= options.dump_freq, "callee incorrectly invoked");
/* Release lock so take_snapshot can grab it */
if (do_snapshot)
/* To avoid the code expansion from a clean call in every bb we use
* a shared clean call sequence, entering it with a direct jump
* and exiting with an indirect jump to a stored return point.
static app_pc
generate_shared_callout(void *drcontext, instrlist_t *ilist, app_pc pc)
/* On entry:
* - SPILL_SLOT_2 holds the return address
* The spill slots are persistent storage across clean calls.
dr_insert_clean_call(drcontext, ilist, NULL,
(void *) shared_instrcnt_callee, false, 0);
PRE(ilist, NULL,
dr_reg_spill_slot_opnd(drcontext, SPILL_SLOT_2)));
shared_instrcnt_callout = pc;
pc = instrlist_encode(drcontext, ilist, pc, false);
instrlist_clear(drcontext, ilist);
return pc;
static void
void *drcontext = dr_get_current_drcontext();
byte *pc;
IF_DEBUG(bool ok;)
instrlist_t *ilist = instrlist_create(drcontext);
shared_code_region = (byte *)
pc = shared_code_region;
pc = generate_shared_callout(drcontext, ilist, pc);
ASSERT(pc - shared_code_region <= SHARED_CODE_SIZE, "shared code region too small");
if (options.staleness) {
pc = generate_shared_slowpath(drcontext, ilist, pc);
ASSERT(pc - shared_code_region <= SHARED_CODE_SIZE,
"shared code region too small");
if (options.check_leaks) {
pc = generate_shared_esp_slowpath(drcontext, ilist, pc);
ASSERT(pc - shared_code_region <= SHARED_CODE_SIZE,
"shared code region too small");
instrlist_clear_and_destroy(drcontext, ilist);
/* now mark as +rx (non-writable) */
IF_DEBUG(ok = )
dr_memory_protect(shared_code_region, SHARED_CODE_SIZE,
ASSERT(ok, "-w failed on shared routines gencode");
DOLOG(2, {
byte *end_pc = pc;
pc = shared_code_region;
LOG(2, "shared_code region:\n");
while (pc < end_pc) {
pc = disassemble_with_info(drcontext, pc, f_global,
true/*show pc*/, true/*show bytes*/);
static void
nonheap_free(shared_code_region, SHARED_CODE_SIZE, HEAPSTAT_GENCODE);
static void
insert_instr_counter(void *drcontext, instrlist_t *bb,
instr_t *first, bool flags_dead, instr_t *where_dead,
uint instrs_in_bb)
instr_t *where = (where_dead == NULL) ? first : where_dead;
instr_t *done = INSTR_CREATE_label(drcontext);
if (!flags_dead)
dr_save_arith_flags(drcontext, bb, first, SPILL_SLOT_1);
/* Rather than an ongoing count that would need a 64-bit
* counter, we do a racy subtract of a 32-bit counter and if
* negative (so we don't need a cmp) then we go to a callout
* that synchs for a counter reset and single snapshot.
* We assume that racy mods during that synch won't overflow
* the counter. We also ignore the detail of how many instrs
* in this bb we've executed yet.
(bb, where, INSTR_CREATE_sub(drcontext, OPND_CREATE_ABSMEM
((byte *)&instr_count, OPSZ_4),
(instrs_in_bb <= CHAR_MAX) ?
OPND_CREATE_INT8(instrs_in_bb) :
/* TODO: for better perf could not bother to check threshold
* in every bb: but don't want to skip for single-bb-loop-body
(bb, where, INSTR_CREATE_jcc(drcontext, OP_jns_short,
/* To avoid the code expansion from a clean call in every bb we use
* a shared clean call sequence, entering it with a direct jump
* and exiting with an indirect jump to a stored return point.
/* Get return point into SPILL_SLOT_2. Spill reg, mov imm to reg, and then
* xchg reg and slot isn't any faster, right?
(bb, where, INSTR_CREATE_mov_st
(drcontext, dr_reg_spill_slot_opnd(drcontext, SPILL_SLOT_2),
(bb, where, INSTR_CREATE_jmp(drcontext,
/* Will return here */
instrlist_meta_preinsert(bb, where, done);
if (!flags_dead)
dr_restore_arith_flags(drcontext, bb, first, SPILL_SLOT_1);
static dr_emit_flags_t
event_bb_app2app(void *drcontext, void *tag, instrlist_t *bb,
bool for_trace, bool translating, void **user_data OUT)
/* we pass bi among all 4 phases */
instru_info_t *ii = thread_alloc(drcontext, sizeof(*ii), HEAPSTAT_PERBB);
memset(ii, 0, sizeof(*ii));
*user_data = (void *) ii;
static dr_emit_flags_t
event_bb_analysis(void *drcontext, void *tag, instrlist_t *bb,
bool for_trace, bool translating, void *user_data)
instru_info_t *ii = (instru_info_t *) user_data;
DOLOG(3, instrlist_disassemble(drcontext, tag, bb, LOGFILE_GET(drcontext)););
DOLOG(3, {
char buf[128];
size_t sofar = 0;
ssize_t len;
if (!translating) {
BUFPRINT(buf, BUFFER_SIZE_ELEMENTS(buf), sofar, len,
"new basic block @"PFX" ==", tag);
print_symbol(tag, buf, BUFFER_SIZE_ELEMENTS(buf), &sofar, false, 0);
LOG(1, "%s\n", buf);
if (options.staleness)
fastpath_top_of_bb(drcontext, tag, bb, &ii->bi);
static dr_emit_flags_t
event_bb_insert(void *drcontext, void *tag, instrlist_t *bb, instr_t *inst,
bool for_trace, bool translating, void *user_data)
instru_info_t *ii = (instru_info_t *) user_data;
uint flags;
fastpath_info_t mi;
if (instr_is_meta(inst))
/* i#2402: Temporarily disable auto predication globally due to poor
* interaction with internal control flow we emit.
drmgr_disable_auto_predication(drcontext, bb);
if (options.time_instrs) {
/* See if flags are dead anywhere.
* FIXME: for fault handling we should either consider a faultable
* instr as having live flags or recover somehow: ignoring for now
* as pathological.
flags = instr_get_arith_flags(inst, DR_QUERY_DEFAULT);
/* We insert after the prev instr to avoid messing up
* -check_leaks instrumentation (PR 560871)
if (TESTALL(EFLAGS_WRITE_6, flags) && !TESTANY(EFLAGS_READ_6, flags)) {
ii->where_dead = instr_get_prev(inst);
ii->flags_dead = true;
if (options.staleness) {
/* We want to spill AFTER any clean call in case it changes mcontext */
ii->bi.spill_after = instr_get_prev(inst);
/* update liveness of whole-bb spilled regs */
fastpath_pre_instrument(drcontext, bb, inst, &ii->bi);
if (instr_uses_memory_we_track(inst)) {
if (instr_ok_for_instrument_fastpath(inst, &mi, &ii->bi)) {
instrument_fastpath(drcontext, bb, inst, &mi, false);
ii->bi.added_instru = true;
} else {
LOG(3, "fastpath unavailable "PFX": ", instr_get_app_pc(inst));
DOLOG(3, { instr_disassemble(drcontext, inst,
LOGFILE_GET(drcontext)); });
LOG(3, "\n");
ii->bi.shared_memop = opnd_create_null();
/* Restore whole-bb spilled regs (PR 489221)
* FIXME: optimize via liveness analysis
mi.reg1 = ii->bi.reg1;
mi.reg2 = ii->bi.reg2;
memset(&mi.reg3, 0, sizeof(mi.reg3));
instrument_slowpath(drcontext, bb, inst,
whole_bb_spills_enabled() ? &mi : NULL);
/* for whole-bb slowpath does interact w/ global regs */
ii->bi.added_instru = whole_bb_spills_enabled();
if (ZERO_STACK() && instr_writes_esp(inst)) {
/* any new spill must be after the alloc instru */
ii->bi.spill_after = instr_get_prev(inst);
/* we zero for leaks, and staleness does not care about xsp */
instrument_esp_adjust(drcontext, bb, inst, &ii->bi,
ii->bi.added_instru = true;
if (options.staleness)
fastpath_pre_app_instr(drcontext, bb, inst, &ii->bi, &mi);
static dr_emit_flags_t
event_bb_instru2instru(void *drcontext, void *tag, instrlist_t *bb,
bool for_trace, bool translating, void *user_data)
instru_info_t *ii = (instru_info_t *) user_data;
if (options.staleness) {
fastpath_bottom_of_bb(drcontext, tag, bb, &ii->bi, ii->bi.added_instru,
translating, false);
if (options.time_instrs) {
insert_instr_counter(drcontext, bb, instrlist_first(bb), ii->flags_dead,
/* insert after the prev instr to avoid messing up
* -check_leaks instrumentation (PR 560871)
(ii->where_dead == NULL) ?
instrlist_first(bb) : instr_get_next(ii->where_dead),
LOG(3, "final instrumentation:\n");
DOLOG(3, instrlist_disassemble(drcontext, tag, bb, LOGFILE_GET(drcontext)););
thread_free(drcontext, ii, sizeof(*ii), HEAPSTAT_PERBB);
return DR_EMIT_DEFAULT; /* deterministic */
/* statistics
* FIXME: make per-thread to avoid races (or use locked inc)
* may want some of these to be 64-bit
static void
int i;
dr_fprintf(f_global, "Statistics:\n");
dr_fprintf(f_global, "app mallocs: %8u, frees: %8u, large mallocs; %6u\n",
num_mallocs, num_frees, num_large_mallocs);
dr_fprintf(f_global, "unique malloc stacks: %8u\n", alloc_stack_count);
dr_fprintf(f_global, "app heap regions: %8u\n", heap_regions);
dr_fprintf(f_global, "peaks detected: %8u, skipped: %8u\n",
peaks_detected, peaks_skipped);
if (options.staleness) {
dr_fprintf(f_global, "staleness: needs large: %7u, needs ext: %7u\n",
stale_needs_large, stale_small_needs_ext);
/* FIXME: share w/ drmemory.c */
dr_fprintf(f_global, "\nPer-opcode slow path executions:\n");
for (i = 0; i <= OP_LAST; i++) {
if (slowpath_count[i] > 0) {
dr_fprintf(f_global, "\t%3u %10s: %12"UINT64_FORMAT_CODE"\n",
i, decode_opcode_name(i), slowpath_count[i]);
#endif /* STATISTICS */
/* drsyscall debug build uses this */
report_callstack(void *drcontext, dr_mcontext_t *mc)
#endif /* DEBUG */
/* Callback handed to heap_region_init() for newly added heap regions.
 * Logs the range and, only when -staleness is enabled, creates shadow
 * memory covering start..end.  The mc parameter is not used.
 */
static void
client_heap_add(app_pc start, app_pc end, dr_mcontext_t *mc)
{
    LOG(2, "%s "PFX"-"PFX"\n", __FUNCTION__, start, end);
    if (!options.staleness)
        return;
    shadow_create_shadow_memory(start, end, 0);
}
/* Callback handed to heap_region_init() for removed heap regions.
 * Currently only logs the range: reinstating the special shadow blocks
 * is compiled out (see the comment in the body).  The mc parameter is
 * not used.
 */
static void
client_heap_remove(app_pc start, app_pc end, dr_mcontext_t *mc)
{
    /* We could save memory and improve performance by putting back the
     * specials -- except w/ our sideline thread doing sweeps we could have
     * races (unlikely since this is a heap region remove) so it's not safe.
     * We just live w/ the extra mem and writes.
     */
    LOG(2, "%s "PFX"-"PFX"\n", __FUNCTION__, start, end);
#if 0 /* disabled: see comment above */
    if (options.staleness)
        shadow_reinstate_specials_in_range(start, end);
#endif
}
/* Per-region callback passed to heap_iterator(): runs the same
 * client-side bookkeeping as a newly added region (client_heap_add) and
 * then records the region as a pre-existing arena.
 */
static void
heap_iter_region(app_pc start, app_pc end _IF_WINDOWS(HANDLE heap))
{
    /* No machine context is available for regions found during the walk */
    client_heap_add(start, end, NULL);
    heap_region_add(start, end, HEAP_PRE_US | HEAP_ARENA, NULL);
}
/* Per-chunk callback passed to heap_iterator(): registers an allocation
 * found during the heap walk with the malloc tracking table.
 */
static void
heap_iter_chunk(app_pc start, app_pc end)
{
    /* The requested size is unknown here, so the real end is passed for
     * both end arguments.
     */
    malloc_add(start, end, end, true/*pre_us*/, 0, NULL, NULL);
}
/* Walk the heap blocks that are already allocated at client init time */
static void
heap_iterator(heap_iter_region, heap_iter_chunk _IF_WINDOWS(NULL));
/* if which_thread is >= 0, creates a file name with which_thread and
* the cur thread's tid
static file_t
open_logfile(const char *name, bool pid_log, int which_thread)
file_t f;
char logname[MAXIMUM_PATH];
IF_DEBUG(int len;)
uint extra_flags = IF_UNIX_ELSE(DR_FILE_ALLOW_LARGE, 0);
ASSERT(logsubdir[0] != '\0', "logsubdir not set up");
if (pid_log) {
IF_DEBUG(len = )
dr_snprintf(logname, BUFFER_SIZE_ELEMENTS(logname),
"%s%c%s.%d.log", logsubdir, DIRSEP, name, dr_get_process_id());
} else if (which_thread >= 0) {
IF_DEBUG(len = )
dr_snprintf(logname, BUFFER_SIZE_ELEMENTS(logname),
"%s%c%s.%d.%d.log", logsubdir, DIRSEP, name,
which_thread, dr_get_thread_id(dr_get_current_drcontext()));
/* have DR close on fork so we don't have to track and iterate */
extra_flags |= DR_FILE_CLOSE_ON_FORK;
} else {
IF_DEBUG(len = )
dr_snprintf(logname, BUFFER_SIZE_ELEMENTS(logname),
"%s%c%s", logsubdir, DIRSEP, name);
ASSERT(len > 0, "logfile name buffer max reached");
f = dr_open_file(logname, DR_FILE_WRITE_OVERWRITE | extra_flags);
ASSERT(f != INVALID_FILE, "unable to open log file");
if (which_thread > 0) {
void *drcontext = dr_get_current_drcontext();
dr_log(drcontext, LOG_ALL, 1,
"DrMemory: log for thread "TIDFMT" is %s\n",
dr_get_thread_id(drcontext), logname);
NOTIFY("thread logfile is %s"NL, logname);
return f;
/* also initializes logsubdir */
static void
uint count = 0;
const char *appnm = dr_get_application_name();
const uint LOGDIR_TRY_MAX = 1000;
/* PR 408644: pick a new subdir inside base logdir */
/* PR 453867: logdir must have pid in its name */
do {
dr_snprintf(logsubdir, BUFFER_SIZE_ELEMENTS(logsubdir),
options.logdir, DIRSEP, appnm == NULL ? "null" : appnm,
dr_get_process_id(), count);
/* FIXME PR 514092: if the base logdir is unwritable, we shouldn't loop
* UINT_MAX times: it looks like we've hung.
* Unfortunately dr_directory_exists() is Windows-only and
* dr_create_dir returns only a bool, so for now we just
* fail if we hit 1000 dirs w/ same pid.
} while (!dr_create_dir(logsubdir) && ++count < LOGDIR_TRY_MAX);
if (count >= LOGDIR_TRY_MAX) {
NOTIFY_ERROR("Unable to create subdir in log base dir %s"NL, options.logdir);
ASSERT(false, "unable to create unique logsubdir");
f_global = open_logfile("global", true/*pid suffix*/, -1);
#ifdef UNIX
/* make it easier for wrapper script to find this logfile */
dr_fprintf(f_global, "process=%d, parent=%d\n",
dr_get_process_id(), dr_get_parent_id());
/* make sure "Dr. Heapstat" is 1st (or 2nd on linux) in file (for PR 453867) */
dr_fprintf(f_global, "Dr. Heapstat version %s\n", VERSION_STRING);
NOTIFY("log dir is %s"NL, logsubdir);
LOGF(1, f_global, "running %s\n",
(dr_get_application_name() == NULL) ? "<null>" : dr_get_application_name());
LOGF(1, f_global, "global logfile fd=%d\n", f_global);
f_callstack = open_logfile("callstack.log", false, -1);
f_snapshot = open_logfile("snapshot.log", false, -1);
if (options.staleness)
f_staleness = open_logfile("staleness.log", false, -1);
/* For long running multi-process apps like sfcbd, this can mean a lot of
* index files. With each file being 1 MB minimum on esxi, space can be
* used up fast. On the other hand, computing the nudge index in postprocess
* each time can be time consuming with large number of snapshots, thereby
* affecting the vistool startup time. No simple solution. If the file
* problem gets out of hand, we might use the global log, but then that
* requires parsing the global log, which can get large.
f_nudge = open_logfile("nudge.idx", false, -1);
dr_fprintf(f_nudge, "%s snapshots\n", options.dump ? "variable" : "constant");
static void
close_file(file_t f)
/* with DRi#357, DR now isolates log files so little to do here */
#define dr_close_file DO_NOT_USE_dr_close_file
static void
reset_to_time_zero(bool keep_offs)
int i;
/* take current data and make it the cur val of to-be-snapshot 0 */
if (snap_idx != 0)
copy_snapshot(&snaps[0], &snaps[snap_idx], true/*live copy*/);
for (i = 1; i < options.snapshots; i++)
memset(&snaps[1], 0, (options.snapshots-1)*sizeof(*snaps));
if (keep_offs)
stamp_offs = stamp;
stamp = 0;
snap_idx = 0;
/* malloc_count we do not reset */
static void
event_timer(void *drcontext, dr_mcontext_t *mcontext)
bool alarm_clock = false, alarm_stale = false;
if (sideline_exit) {
#ifdef UNIX
dr_set_itimer(ITIMER_REAL, 0, event_timer);
if (options.time_clock) {
ASSERT(timer_real <= timer_clock, "timer internal error");
timer_clock -= timer_real;
if (timer_clock == 0) {
alarm_clock = true;
timer_clock = options.dump_freq*TIME_BASE_FREQ;
if (options.staleness) {
ASSERT(timer_real <= timer_stale, "timer internal error");
timer_stale -= timer_real;
if (timer_stale == 0) {
alarm_stale = true;
/* FIXME PR 553724: should -stale_granularity be increased as
* -dump_freq increases? For a long-running app it seems fine to
* make the staleness data coarser and coarser, which also improves
* performance.
timer_stale = options.stale_granularity;
/* reset now before potentially taking a while in code below, to avoid
* clock drift
if (alarm_clock) {
timestamp_last_snapshot = dr_get_milliseconds();
/* Must hold malloc lock first. We set alloc_ops.global_lock, so this
* will synchronize w/ all allocations and frees.
if (alarm_stale) {
static void
/* set real timer to the smaller and when it fires we'll adjust both */
if (options.staleness) {
if (options.time_clock) {
timer_real = (timer_clock > 0 && timer_clock < timer_stale) ?
timer_clock : timer_stale;
} else
timer_real = timer_stale;
} else {
ASSERT(options.time_clock, "timer should not be active");
timer_real = timer_clock;
ASSERT(timer_real >= 0, "timer internal error");
#ifdef UNIX
if (!dr_set_itimer(ITIMER_REAL, timer_real, event_timer))
ASSERT(false, "unable to set up timer callback\n");
static void
if (options.time_clock)
timer_clock = options.dump_freq*TIME_BASE_FREQ;
/* For -time_clock we need a timer (PR 476008): simplest to use a separate thread */
static void
sideline_run(void *arg)
void *drcontext = dr_get_current_drcontext();
/* PR 609569: we spend a lot of time holding locks, so keep running
* during synchall. Our locks keep us safe wrt leak scan, and we
* do not mutate any app state or non-persistent DR state.
if (options.thread_logs) {
/* easier to debug w/ all staleness and clock snaps in sep file */
/* FIXME i#297: have to use global heap for sideline thread */
sideline_pt = global_alloc(sizeof(*sideline_pt), HEAPSTAT_MISC);
memset(sideline_pt, 0, sizeof(*sideline_pt));
/* store it in the slot provided in the drcontext */
drmgr_set_tls_field(drcontext, tls_idx_util, (void *)sideline_pt);
utils_thread_set_file(drcontext, open_logfile("sideline.log", false, -1));
if (options.staleness)
timer_stale = options.stale_granularity;
ASSERT(options.time_clock || options.staleness, "thread should not be running");
LOG(1, "sideline thread "TIDFMT" running\n", dr_get_thread_id(drcontext));
while (!sideline_exit) {
#ifdef WINDOWS
/* FIXME: check wall-clock time and normalize to get closer to real time */
event_timer(drcontext, NULL);
#ifdef UNIX
dr_set_itimer(ITIMER_REAL, 0, event_timer);
/* i#297: we can't clean up sideline_pt so event_exit does it */
#ifdef UNIX
static void
event_fork(void *drcontext)
/* we want a whole new log dir to avoid clobbering the parent's */
tls_heapstat_t *pt = (tls_heapstat_t *)
drmgr_get_tls_field(drcontext, tls_idx_heapstat);
/* fds are shared across fork so we must duplicate */
file_t f_parent_callstack = dr_dup_file_handle(f_callstack);
/* we assume no lock is needed since only one thread post-fork */
static char buf[4096];
if (options.staleness)
/* now create new files for all 5 */
utils_thread_set_file(drcontext, f_global);
LOG(0, "new logfile after fork fd=%d\n", f_global);
/* we don't expect the user to go find the parent's data,
* so we need to duplicate the callstack file and start
* the snapshots over. we no longer keep full callstacks
* in memory (PR 496304) so we copy the file.
* so we don't have to parse and find id# num_callstacks we
* store the file position.
if (dr_file_seek(f_parent_callstack, 0, DR_SEEK_SET)) {
int64 curpos = 0;
IF_DEBUG(ssize_t sz;)
LOG(1, "copying parent callstacks "INT64_FORMAT_STRING" bytes\n", pt->filepos);
while (curpos + sizeof(buf) <= pt->filepos) {
IF_DEBUG(sz = )
dr_read_file(f_parent_callstack, buf, sizeof(buf));
ASSERT(sz == sizeof(buf), "error reading parent callstack data");
IF_DEBUG(sz = )
dr_write_file(f_callstack, buf, sizeof(buf));
ASSERT(sz == sizeof(buf), "error writing parent callstack data");
curpos += sizeof(buf);
ASSERT(pt->filepos - curpos < sizeof(buf), "buf calc error");
IF_DEBUG(sz = )
dr_read_file(f_parent_callstack, buf, pt->filepos - curpos);
ASSERT(sz == pt->filepos - curpos, "error reading parent callstack data");
IF_DEBUG(sz = )
dr_write_file(f_callstack, buf, pt->filepos - curpos);
ASSERT(sz == pt->filepos - curpos, "error writing parent callstack data");
} else
LOG(0, "ERROR: unable to copy parent callstack file\n");
reset_to_time_zero(false/*start time over*/);
#ifdef UNIX
event_signal(void *drcontext, dr_siginfo_t *info)
if (info->sig == SIGSEGV && ZERO_STACK() &&
handle_zeroing_fault(drcontext, info->access_address, info->raw_mcontext,
info->mcontext)) {
event_exception(void *drcontext, dr_exception_t *excpt)
if (excpt->record->ExceptionCode == STATUS_ACCESS_VIOLATION) {
app_pc target = (app_pc) excpt->record->ExceptionInformation[1];
if (ZERO_STACK() &&
excpt->record->ExceptionInformation[0] == 1 /* write */ &&
handle_zeroing_fault(drcontext, target, excpt->raw_mcontext,
excpt->mcontext)) {
return false;
return true;
event_restore_state_nop(void *drcontext, bool restore_memory,
dr_restore_state_info_t *info)
/* nothing: just here to avoid DR warning on zeroing loop faulting instrs */
return true;
/* Syscall filter callback.  Returns true directly for the UNIX syscalls
 * this client handles itself (close, fork, clone, plus 1025 when built
 * for VMX86); every other syscall number is decided by
 * alloc_syscall_filter().
 */
static bool
event_filter_syscall(void *drcontext, int sysnum)
{
    switch (sysnum) {
#ifdef UNIX
    case SYS_close:
    case SYS_fork:
    case SYS_clone:
    IF_VMX86(case 1025:)
        return true;
#endif
    default:
        break;
    }
    /* Not one of ours: let the allocation tracker decide */
    return alloc_syscall_filter(drcontext, sysnum);
}
static bool
event_pre_syscall(void *drcontext, int sysnum)
tls_heapstat_t *pt = (tls_heapstat_t *)
drmgr_get_tls_field(drcontext, tls_idx_heapstat);
dr_mcontext_t *mc;
if (drsys_get_mcontext(drcontext, &mc) != DRMF_SUCCESS)
ASSERT(false, "drsys_get_mcontext failed");
DOLOG(2, {
drsys_sysnum_t sysnum_full;
drsys_syscall_t *syscall;
const char *name;
if (drsys_cur_syscall(drcontext, &syscall) != DRMF_SUCCESS)
ASSERT(false, "shouldn't fail");
if (drsys_syscall_number(syscall, &sysnum_full) != DRMF_SUCCESS)
ASSERT(false, "drsys_get_sysnum failed");
if (drsys_syscall_name(syscall, &name) != DRMF_SUCCESS)
ASSERT(false, "drsys_syscall_name failed");
LOG(2, "system call #%d==%d.%d %s\n", sysnum,
sysnum_full.number, sysnum_full.secondary, name);
handle_pre_alloc_syscall(drcontext, sysnum, mc);
#ifdef UNIX
if (sysnum == SYS_fork ||
(sysnum == SYS_clone &&
!TEST(CLONE_VM, (uint) dr_syscall_get_param(drcontext, 0)))
/* FIXME: if open-sourced we should split this.
* Presumably we'll have the bora shlib split before then.
IF_VMX86(|| sysnum == 1025)) {
/* Store the file offset in the client_data field, shared across callbacks */
pt->filepos = dr_file_tell(f_callstack);
LOG(1, "SYS_fork: callstack file @ "INT64_FORMAT_STRING"\n", pt->filepos);
return true; /* execute syscall */
static void
event_post_syscall(void *drcontext, int sysnum)
dr_mcontext_t *mc;
if (drsys_get_mcontext(drcontext, &mc) != DRMF_SUCCESS)
ASSERT(false, "drsys_get_mcontext failed");
handle_post_alloc_syscall(drcontext, sysnum, mc);
check_reachability(bool at_exit)
if (options.check_leaks) {
void *drcontext = dr_get_current_drcontext();
ssize_t len = 0;
size_t sofar = 0;
char *buf;
size_t bufsz;
get_buffer(drcontext, &buf, &bufsz);
BUFPRINT(buf, bufsz, sofar, len,
"ERRORS IGNORED:\n %5d still-reachable allocation(s)\n",
if (!options.show_reachable) {
BUFPRINT(buf, bufsz, sofar, len,
" (re-run with \"-check_leaks -show_reachable\""
" for details)\n");
print_buffer(f_global, buf);
release_buffer(drcontext, buf, bufsz);
/* Writes a "NUDGE @ <stamp> <unit>" marker line, followed by a blank line,
 * to file f.  The stamp is that of the current snapshot (snaps[snap_idx])
 * and the unit string comes from unit_name().
 */
static void
print_nudge_header(file_t f)
{
    dr_fprintf(f, "NUDGE @ %16"INT64_FORMAT"u %s\n\n",
               snaps[snap_idx].stamp, unit_name());
}
static void
event_nudge(void *drcontext, uint64 argument)
uint64 snapshot_fpos = 0;
uint64 staleness_fpos = 0;
/* PR 476043: use nudge to output snapshots for daemon app. For
* now we have only one use, so we don't need the argument, but we
* may want an option to reset the start point for whole-run
* vs since-last-nudge snapshots: though that would require
* changing all usage types to be signed (PR 553707) so we
* keep everything absolute w/ no reset on nudge.
/* We hold the malloc lock across the snapshot + reset to prevent
* new allocs being added in between
malloc_lock(); /* must be acquired before snapshot_lock */
if (options.dump) {
/* For const # snapshots, we want the peak to be the global peak for the
* whole run, regardless of how many nudges were done.
* For dump-as-you-go, we want a local peak since the last nudge.
memset(&snap_peak, 0, sizeof(snap_peak));
if (options.staleness)
/* Print the nudge index information, i.e., for each nudge, print the file
* position in snapshot.log that marks the beginning of new data after nudge
* data was dumped. This makes nudge handling efficient as opposed to
* building the index during the post process stage which would require a
* full pass of the snapshot.log file which can be large. The same applies
* to staleness.log. Also specify whether snapshots are fixed in number or
* variable (-dump). PR 502468.
snapshot_fpos = dr_file_tell(f_snapshot);
if (options.staleness)
staleness_fpos = dr_file_tell(f_staleness);
ASSERT(snapshot_fpos >= 0 && staleness_fpos >= 0, "bad log file location");
dr_fprintf(f_nudge, "%d,%"INT64_FORMAT"u,%"INT64_FORMAT"u\n",
nudge_count, snapshot_fpos, staleness_fpos);
NOTIFY("Received nudge"NL, logsubdir);
/* Module-load event: notifies the callstack module first and then the
 * allocation tracker, passing the event arguments through unchanged.
 */
static void
event_module_load(void *drcontext, const module_data_t *info, bool loaded)
{
    callstack_module_load(drcontext, info, loaded);
    alloc_module_load(drcontext, info, loaded);
}
/* Module-unload event: notifies the callstack module first and then the
 * allocation tracker, passing the event arguments through unchanged.
 */
static void
event_module_unload(void *drcontext, const module_data_t *info)
{
    callstack_module_unload(drcontext, info);
    alloc_module_unload(drcontext, info);
}
/* Fragment-deletion event: lets the instrumentation code and then the
 * allocation tracker react to the deletion of the fragment identified
 * by tag.
 */
static void
event_fragment_delete(void *drcontext, void *tag)
{
    instrument_fragment_delete(drcontext, tag);
    alloc_fragment_delete(drcontext, tag);
}
static void
event_thread_init(void *drcontext)
uint which_thread = atomic_add32_return_sum((volatile int *)&num_threads, 1) - 1;
file_t f;
tls_heapstat_t *pt = (tls_heapstat_t *)
thread_alloc(drcontext, sizeof(*pt), HEAPSTAT_MISC);
memset(pt, 0, sizeof(*pt));
drmgr_set_tls_field(drcontext, tls_idx_heapstat, (void *)pt);
ELOGF(0, f_global, "new thread #%d id=%d\n",
which_thread, dr_get_thread_id(drcontext));
if (!options.thread_logs) {
f = f_global;
} else {
/* we're going to dump our data to a per-thread file */
f = open_logfile("thread", false, which_thread/*tid suffix*/);
LOGPT(1, PT_GET(drcontext), "thread logfile fd=%d\n", f);
utils_thread_set_file(drcontext, f);
pt->errbufsz = MAX_ERROR_INITIAL_LINES + max_callstack_size();
pt->errbuf = (char *) thread_alloc(drcontext, pt->errbufsz, HEAPSTAT_MISC);
LOGPT(2, PT_GET(drcontext), "in event_thread_init()\n");
if (options.check_leaks || options.staleness)
if (options.staleness)
static void
event_thread_exit(void *drcontext)
tls_heapstat_t *pt = (tls_heapstat_t *)
drmgr_get_tls_field(drcontext, tls_idx_heapstat);
LOGPT(2, PT_GET(drcontext), "in event_thread_exit()\n");
if (options.staleness)
thread_free(drcontext, (void *) pt->errbuf, pt->errbufsz, HEAPSTAT_MISC);
/* with PR 536058 we do have dcontext in exit event so indicate explicitly
* that we've cleaned up the per-thread data
drmgr_set_tls_field(drcontext, tls_idx_heapstat, NULL);
thread_free(drcontext, pt, sizeof(*pt), HEAPSTAT_MISC);
/* malloc_iterate() callback used at exit: when -staleness is enabled,
 * frees the per-allocation staleness data stored in info->client_data.
 * Always returns true; iter_data is not used.
 */
static bool
alloc_itercb_exit(malloc_info_t *info, void *iter_data)
{
    if (options.staleness) {
        stale_per_alloc_t *per_alloc = (stale_per_alloc_t *)info->client_data;
        staleness_free_per_alloc(per_alloc);
    }
    return true;
}
static void
LOGF(2, f_global, "in event_exit\n");
if (options.time_clock || options.staleness) {
sideline_exit = true;
if (options.thread_logs) {
/* i#297: sideline_run never gets a chance to clean up so we do it */
ASSERT(sideline_pt != NULL, "sideline per-thread error");
global_free(sideline_pt, sizeof(*sideline_pt), HEAPSTAT_MISC);
if (options.check_leaks) {
if (options.staleness)
malloc_iterate(alloc_itercb_exit, NULL);
alloc_exit(); /* must be before deleting alloc_stack_table */
heap_region_exit(); /* must be after alloc_exit */
LOG(1, "final alloc stack table size: %u bits, %u entries\n",
alloc_stack_table.table_bits, alloc_stack_table.entries);
if (options.check_leaks || options.staleness) {
if (options.use_symcache)
if (umbra_exit() != DRMF_SUCCESS)
ASSERT(false, "fail to finalize umbra");
if (drsys_exit() != DRMF_SUCCESS)
ASSERT(false, "drsys failed to exit");
dr_fprintf(f_global, "LOG END\n");
dr_fprintf(f_callstack, "LOG END\n");
dr_fprintf(f_snapshot, "LOG END\n");
if (options.staleness) {
dr_fprintf(f_staleness, "LOG END\n");
dr_init(client_id_t client_id)
const char *opstr = dr_get_options(client_id);
alloc_options_t alloc_ops;
drsys_options_t ops = { sizeof(ops), 0, };
callstack_options_t callstack_ops = { sizeof(callstack_ops), 0 };
drmgr_priority_t priority = {sizeof(priority), "drheapstat", NULL, NULL, 1000};
dr_set_client_name("Dr. Heapstat", "");
ASSERT(opstr != NULL, "error obtaining option string");
drmgr_init(); /* must be before utils_init and any other tls/cls uses */
tls_idx_heapstat = drmgr_register_tls_field();
ASSERT(tls_idx_heapstat > -1, "unable to reserve TLS slot");
if (drsys_init(client_id, &ops) != DRMF_SUCCESS)
ASSERT(false, "drsys failed to init");
if (umbra_init(client_id) != DRMF_SUCCESS)
ASSERT(false, "Umbra failed to init");
/* now that we know whether -quiet, print basic info */
NOTIFY("Dr. Heapstat version %s"NL, VERSION_STRING);
NOTIFY("options are \"%s\""NL, opstr);
LOG(0, "options are \"%s\"\n", opstr);
if (!drmgr_register_bb_instrumentation_ex_event
(event_bb_app2app, event_bb_analysis, event_bb_insert,
event_bb_instru2instru, &priority))
ASSERT(false, "drmgr registration failed");
#ifdef UNIX
if (ZERO_STACK()) {
#ifdef UNIX
/* simplest to filter all for pre-syscall-arg access: else we'd
* need to ask alloc for all that it cares about.
if (drsys_filter_all_syscalls() != DRMF_SUCCESS)
ASSERT(false, "drsys_filter_all_syscalls should never fail");
dr_register_nudge_event(event_nudge, client_id);
if (options.staleness)
/* DR complains if we have xl8 fields and faults w/o restore-state events */
else if (ZERO_STACK())
if (options.staleness)
/* make it easy to tell, by looking at log file, which client executed */
dr_log(NULL, LOG_ALL, 1, "client = Dr. Heapstat version %s\n", VERSION_STRING);
if (options.use_symcache) {
if (!option_specified.symcache_dir) {
dr_snprintf(options.symcache_dir, BUFFER_SIZE_ELEMENTS(options.symcache_dir),
"%s/symcache", options.logdir);
drsymcache_init(client_id, options.symcache_dir, 0);
callstack_ops.global_max_frames = options.callstack_max_frames;
callstack_ops.stack_swap_threshold = 0x10000;
/* default flags: but if we have apps w/ DGC we may
* want to expose some flags as options
callstack_ops.fp_flags = 0;
/* scan forward 1 page: good compromise between perf (scanning
* can be the bottleneck) and good callstacks
callstack_ops.fp_scan_sz = PAGE_SIZE;
/* XXX i#926: symbolize and suppress leaks online (and then
* use options.callstack_style here)
callstack_ops.print_flags = PRINT_FOR_POSTPROCESS;
callstack_ops.ignore_xbp = IF_WINDOWS_ELSE(is_in_seh_unwind, NULL);
/* XXX: may need better callstack heuristics w/o shadow info
* if user turns off stack zeroing from -leaks_only
callstack_ops.old_retaddrs_zeroed = options.zero_retaddr;
callstack_ops.tool_lib_ignore = IF_WINDOWS_ELSE("drheapstatlib.dll",
callstack_ops.bad_fp_list = options.callstack_bad_fp_list;
heap_region_init(client_heap_add, client_heap_remove);
/* We keep callstacks around forever and only free when we delete
* the alloc_stack_table, so no refcounts
/* must be before heap_walk() and alloc_init() */
if (options.check_leaks || options.staleness)
hashtable_init_ex(&alloc_stack_table, ASTACK_TABLE_HASH_BITS, HASH_CUSTOM,
false/*!str_dup*/, false/* !synch; + higher-level synch covered
* by malloc_table's lock */,
#ifdef USE_MD5
(uint (*)(void*)) md5_hash,
(bool (*)(void*, void*)) md5_digests_equal
(uint (*)(void*)) crc32_whole_and_half_hash,
(bool (*)(void*, void*)) crc32_whole_and_half_equal
hashtable_init_ex(&alloc_md5_table, ASTACK_TABLE_HASH_BITS, HASH_CUSTOM,
false/*!str_dup*/, false/* !synch; + higher-level synch covered
* by malloc_table's lock */,
NULL/*points at md5 data stored in alloc_stack_table*/,
(uint (*)(void*)) md5_hash,
(bool (*)(void*, void*)) md5_digests_equal);
memset(&alloc_ops, 0, sizeof(alloc_ops));
alloc_ops.track_allocs = true;
alloc_ops.track_heap = true;
alloc_ops.redzone_size = 0; /* no redzone */
alloc_ops.size_in_redzone = false;
alloc_ops.record_allocs = true;
alloc_ops.get_padded_size = true;
alloc_ops.cache_postcall = false;
alloc_ops.intercept_operators = false;
alloc_ops.conservative = options.conservative;
alloc_ops.global_lock = true; /* we want to serialize w/ our snapshots */
alloc_ops.replace_malloc = true;
alloc_ops.use_symcache = options.use_symcache;
alloc_init(&alloc_ops, sizeof(alloc_ops));
/* must be after heap_region_init and snapshot_init */
if (options.check_leaks || options.staleness)
if (options.time_clock)
timestamp_last_snapshot = dr_get_milliseconds();
if (options.time_clock || options.staleness) {
if (!dr_create_client_thread(sideline_run, NULL)) {
ASSERT(false, "unable to create thread");
if (options.check_leaks) {
leak_init(false/*no defined info*/,