/* **********************************************************
* Copyright (c) 2010-2017 Google, Inc. All rights reserved.
* Copyright (c) 2009-2010 VMware, Inc. All rights reserved.
* **********************************************************/
/* Dr. Memory: the memory debugger
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation;
* version 2.1 of the License, and no later version.
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
/***************************************************************************
* Dr. Heapstat heap profiler
*/
#include "dr_api.h"
#include "drwrap.h"
#include "drsyscall.h"
#include "drsymcache.h"
#include "umbra.h"
#include "drheapstat.h"
#include "alloc.h"
#include "heap.h"
#include "callstack.h"
#include "crypto.h"
#include "staleness.h"
#include "../drmemory/leak.h"
#include "../drmemory/stack.h"
#include "../drmemory/shadow.h"
#include "../drmemory/instru.h"
#include "../drmemory/slowpath.h"
#ifdef MACOS
# error NYI i#1438
#elif defined(LINUX)
# include "sysnum_linux.h"
# include <errno.h>
# define _GNU_SOURCE /* for sched.h */
# include <linux/sched.h> /* for CLONE_VM */
# include <sys/time.h>
# include <signal.h> /* for SIGSEGV */
#endif
#include <stddef.h> /* for offsetof */
#ifdef USE_MD5
# define IF_MD5_ELSE(x, y) x
#else
# define IF_MD5_ELSE(x, y) y
#endif
/* for sharing data among instrumentation passes */
typedef struct _instru_info_t {
bb_info_t bi;
instr_t *where_dead;
bool flags_dead;
uint instrs_in_bb;
} instru_info_t;
char logsubdir[MAXIMUM_PATH];
file_t f_callstack = INVALID_FILE;
file_t f_snapshot = INVALID_FILE;
file_t f_staleness = INVALID_FILE;
file_t f_nudge = INVALID_FILE; /* PR 502468 - nudge visualization */
static uint num_threads;
/* Counters for time unit intervals */
static int instr_count;
static int byte_count;
static int allocfree_count;
/* For -time_clock, the frequency is -dump_freq*10 milliseconds */
#define TIME_BASE_FREQ 10
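/* Worked example (illustrative values): with -time_clock and -dump_freq 50,
 * reset_clock_timer() below arms the clock timer for 50*TIME_BASE_FREQ = 500
 * milliseconds per snapshot.
 */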
/* These are all in milliseconds */
static uint timer_clock;
static uint timer_stale;
static uint timer_real;
/* Needed to compute the current partial snapshot (PR 548013) */
uint64 timestamp_last_snapshot;
static volatile bool sideline_exit;
/* FIXME i#297: DR synchs + terminates our sideline thread prior to
* calling our exit event so we have no chance to clean up its memory.
* DR then asserts about leaks. Using a targeted solution for now.
*/
static tls_util_t *sideline_pt;
static int leak_count;
static int reachable_leak_count;
/* To avoid the code expansion from a clean call in every bb we use
* a shared clean call sequence, entering it with a direct jump
* and exiting with an indirect jump to a stored return point.
*/
static byte *shared_instrcnt_callout;
static byte *shared_code_region;
#define SHARED_CODE_SIZE \
(PAGE_SIZE + (options.staleness ? (SHARED_SLOWPATH_SIZE) : 0))
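/* Rough layout of the shared code region as generated by create_shared_code()
 * (each slowpath is present only when its option is enabled):
 *   [instr-count callout][staleness slowpath, -staleness][esp slowpath, -check_leaks]
 * Instrumented bbs (-time_instrs) enter the callout via a direct jmp and return
 * via an indirect jmp through SPILL_SLOT_2 (see insert_instr_counter() and
 * generate_shared_callout() below).
 */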
/* We serialize snapshots since they are rare, so serializing is not costly
 * perf-wise, and it avoids needing potentially very large buffers to try to
 * get atomic writes.
*/
static void *snapshot_lock;
/* We intercept libc/ntdll allocation routines instead of providing our
* own, for maximum transparency. Both use 8-byte (for 32-bit) headers.
* FIXME: for 64-bit Windows, 8-byte-or-smaller allocs have special headers
* that are only 8 bytes instead of 16?
*/
#ifdef UNIX
/* FIXME: mmap chunks have 2*size headers (xref PR 474912) */
# define HEADER_SIZE sizeof(size_t)
#else
# define HEADER_SIZE (2*sizeof(size_t))
#endif
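/* Illustrative accounting example (hypothetical sizes): a 32-bit Linux malloc(20)
 * that the allocator pads to 24 usable bytes reaches account_for_bytes_pre() as
 * asked_for=20, extra_usable=4 (the padding), extra_occupied=HEADER_SIZE=4, and
 * thus contributes 28 bytes to tot_bytes_occupied.
 */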
static void reset_clock_timer(void);
static void reset_real_timer(void);
static void event_thread_exit(void *drcontext);
#ifdef STATISTICS
static void
dump_statistics(void);
# define STATS_DUMP_FREQ 10000
uint alloc_stack_count;
static uint peaks_detected;
static uint peaks_skipped;
#endif
/* PR 465174: share allocation site callstacks.
* This table should only be accessed while holding the lock for
* malloc_table (via malloc_lock(), which we enable via
* alloc_ops.global_lock), which makes the coordinated operations with
* malloc_table atomic.
*/
#define ASTACK_TABLE_HASH_BITS 8
static hashtable_t alloc_stack_table;
#ifdef CHECK_WITH_MD5
/* Used to check collisions with crc32 */
static hashtable_t alloc_md5_table;
#endif
/***************************************************************************
* TLS and CLS
*/
typedef struct _tls_heapstat_t {
char *errbuf; /* buffer for atomic writes */
size_t errbufsz;
# ifdef UNIX
int64 filepos; /* f_callstack file position */
# endif
} tls_heapstat_t;
/* XXX: share w/ syscall_os.h */
#ifdef WINDOWS
# define SYSCALL_NUM_ARG_STORE 14
#else
# define SYSCALL_NUM_ARG_STORE 6 /* 6 is max on Linux */
#endif
static int tls_idx_heapstat = -1;
/***************************************************************************
* OPTIONS
*/
static void
reset_intervals(void)
{
if (options.dump) {
options.snapshots = 1;
} else {
options.dump_freq = 1;
}
/* Instr threshold is per 1k, so a max of 2 trillion instrs.
* We could support more and still do 32-bit inlined arith by
* having the callout do a mod.
*/
if (options.time_instrs) {
if (options.dump_freq > UINT_MAX/1000)
usage_error("-dump_freq: value too large", "");
options.dump_freq *= 1000;
/* We count backward */
instr_count = options.dump_freq;
} else if (options.time_bytes) {
if (!options.dump) {
/* only *8 if doing const # snapshots. this is invisible
* to user so no need to talk about in docs: just makes
* initial snapshots more reasonable than starting at 1.
*/
ASSERT(options.dump_freq == 1, "invalid assumption");
options.dump_freq = 8;
}
/* We count backward */
byte_count = options.dump_freq;
} else if (options.time_allocs) {
/* We count backward */
allocfree_count = options.dump_freq;
}
}
static void
drheap_options_init(const char *opstr)
{
options_init(opstr);
reset_intervals();
/* set globals */
op_print_stderr = options.use_stderr;
op_verbose_level = options.verbose;
op_pause_at_assert = options.pause_at_assert;
op_pause_via_loop = options.pause_via_loop;
op_ignore_asserts = options.ignore_asserts;
op_use_symcache = options.use_symcache;
}
/***************************************************************************
* EVENTS FOR COMMON/ALLOC.C
*/
/* Snapshot arrays: of size options.snapshots. For !options.dump,
* when the arrays fill, we double the options.dump_freq frequency and
 * then replace the snapshots that would not have existed if the
 * frequency had been the doubled value from the beginning.
*/
static uint snap_idx;
static uint snap_fills;
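/* Illustrative example of the doubling scheme (hypothetical values): with
 * -snapshots 64 and an initial dump_freq of 8, the first wrap-around doubles
 * dump_freq to 16; take_snapshot() then skips (keeps) the slots whose stamp is
 * a multiple of 16 and overwrites those at odd multiples of 8, which would not
 * have existed had the frequency been 16 from the start.
 */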
/* We keep a linked list of these structs per snapshot.
* One struct per callstack that has non-zero usage in that snapshot.
* This can save a lot of memory versus arrays when there are
* many callstacks and few are present in all snapshots.
* Xref PR 493134.
*/
typedef struct _heap_used_t {
/* FIXME: 64-bit? */
uint instances;
uint bytes_asked_for;
ushort extra_usable; /* beyond bytes_asked_for */
ushort extra_occupied; /* beyond bytes_asked_for + extra_usable */
struct _heap_used_t *next;
per_callstack_t *callstack;
} heap_used_t;
/* Arrays of snapshots. Not using a heap_used_t b/c we need larger counters. */
typedef struct _per_snapshot_t {
uint64 stamp;
uint64 tot_mallocs;
uint64 tot_bytes_asked_for;
uint64 tot_bytes_usable;
uint64 tot_bytes_occupied;
/* Linked list of non-zero usage per callstack */
heap_used_t *used;
/* Staleness data: array with one entry per live malloc */
stale_snap_allocs_t *stale;
} per_snapshot_t;
static uint64 stamp;
/* Used for starting time 0 in middle of run (post-nudge usually) */
static uint64 stamp_offs;
static per_snapshot_t *snaps;
/* peak snapshot (PR 476018) */
static per_snapshot_t snap_peak;
/* track changes in # allocs+frees for PR 566116 */
static uint64 allocfree_cur, allocfree_last_peak;
#define SNAPSHOT_LOG_BUF_SIZE (32*1024)
static char snaps_log_buf[SNAPSHOT_LOG_BUF_SIZE]; /* PR 551841 */
struct _per_callstack_t {
uint id;
#if defined(USE_MD5) || defined(CHECK_WITH_MD5)
/* PR 496304: we keep just the md5 to save memory. We either do so
* instead of crc32, or in addition as a debug check on collisions.
*/
byte md5[MD5_RAW_BYTES];
#endif
#ifndef USE_MD5
/* PR 496304: we keep just a checksum to save memory. Since crc32 could
* collide we use crc32 of the whole callstack plus a separate crc32
* of the first half of the callstack.
*/
uint crc[2];
#endif
/* for the current snapshot */
heap_used_t *used;
/* for node removal w/o keeping a prev per heap_used_t per snapshot */
heap_used_t *prev_used;
};
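/* Linkage summary: each per_callstack_t points via its used field at its
 * heap_used_t node in the current ("live") snapshot's list, and prev_used
 * points at the preceding list node so account_for_bytes_pre() can unlink the
 * node in O(1) when its instance count drops to zero. copy_snapshot()
 * re-establishes both pointers when it clones the live list.
 */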
static uint num_callstacks;
static uint snapshot_count;
static uint nudge_count;
uint
get_cstack_id(per_callstack_t *per)
{
return per->id;
}
static inline per_callstack_t *
get_cstack_from_alloc_data(void *client_data)
{
/* We store either per_callstack_t or stale_per_alloc_t in the
* client_data slot in each malloc
*/
if (options.staleness)
return ((stale_per_alloc_t *)client_data)->cstack;
else
return (per_callstack_t *) client_data;
}
void
alloc_callstack_free(void *p)
{
per_callstack_t *per = (per_callstack_t *) p;
global_free(per, sizeof(*per), HEAPSTAT_CALLSTACK);
}
void
client_malloc_data_free(void *data)
{
/* nothing to do since we persist our callstacks in alloc_stack_table */
}
void *
client_malloc_data_to_free_list(void *cur_data, dr_mcontext_t *mc, app_pc post_call)
{
/* nothing to do since we persist our callstacks in alloc_stack_table */
return cur_data;
}
void *
client_malloc_data_free_split(void *cur_data)
{
/* Just keep the same data. There's no use-after-free b/c we don't free
* it in client_malloc_data_free().
*/
return cur_data;
}
static void
get_buffer(void *drcontext, char **buf/*OUT*/, size_t *bufsz/*OUT*/)
{
tls_heapstat_t *pt = (tls_heapstat_t *)
drmgr_get_tls_field(drcontext, tls_idx_heapstat);
ASSERT(buf != NULL && bufsz != NULL, "invalid args");
if (pt == NULL) {
/* at init time no pt yet */
*bufsz = MAX_ERROR_INITIAL_LINES + max_callstack_size();
*buf = (char *) global_alloc(*bufsz, HEAPSTAT_CALLSTACK);
} else {
*buf = pt->errbuf;
*bufsz = pt->errbufsz;
}
}
static void
release_buffer(void *drcontext, char *buf, size_t bufsz)
{
tls_heapstat_t *pt = (tls_heapstat_t *)
drmgr_get_tls_field(drcontext, tls_idx_heapstat);
if (pt == NULL) {
global_free(buf, bufsz, HEAPSTAT_CALLSTACK);
}
}
static const char *
unit_name(void)
{
if (options.time_instrs)
return "instrs";
if (options.time_allocs)
return "mallocs";
if (options.time_bytes)
return "bytes";
if (options.time_clock)
return "ticks (10ms each)";
return "<error>";
}
/* Up to caller to synchronize */
static void
dump_snapshot(per_snapshot_t *snap, int idx/*-1 means peak*/)
{
heap_used_t *u;
size_t sofar = 0;
ssize_t len = 0;
LOG(2, "dumping snapshot idx=%d count=%"INT64_FORMAT"u\n",
idx, snap->stamp);
dr_fprintf(f_snapshot, "SNAPSHOT #%4d @ %16"INT64_FORMAT"u %s\n",
snapshot_count, snap->stamp + stamp_offs, unit_name());
dr_fprintf(f_snapshot, "idx=%d, stamp_offs=%16"INT64_FORMAT"u\n",
idx, stamp_offs);
dr_fprintf(f_snapshot, "total: %"INT64_FORMAT"u,%"INT64_FORMAT"u,%"
INT64_FORMAT"u,%"INT64_FORMAT"u\n",
snap->tot_mallocs, snap->tot_bytes_asked_for,
snap->tot_bytes_usable, snap->tot_bytes_occupied);
for (u = snap->used; u != NULL; u = u->next) {
if (u->bytes_asked_for + u->extra_usable > 0) {
/* PR 551841: buffer snapshot output else performance is bad. */
BUFFERED_WRITE(f_snapshot, snaps_log_buf, SNAPSHOT_LOG_BUF_SIZE,
sofar, len, "%u,%u,%u,%u,%u\n",
u->callstack->id, u->instances, u->bytes_asked_for,
u->extra_usable, u->extra_occupied);
}
}
FLUSH_BUFFER(f_snapshot, snaps_log_buf, sofar);
if (options.staleness) {
uint i;
dr_fprintf(f_staleness, "SNAPSHOT #%4d @ %16"INT64_FORMAT"u %s\n",
snapshot_count, snap->stamp + stamp_offs, unit_name());
/* FIXME: optimize by listing cstack id only once; or binary format.
* If still too big then collapse similar timestamps and give up
* some runtime flexibility in granularity
*/
sofar = 0;
for (i = 0; snap->stale != NULL && i < snap->stale->num_entries; i++) {
/* PR 551841: improve perf by printing to buffer to reduce # file writes */
BUFFERED_WRITE(f_staleness, snaps_log_buf, SNAPSHOT_LOG_BUF_SIZE,
sofar, len, "%u,%u,%"INT64_FORMAT"u\n",
staleness_get_snap_cstack_id(snap->stale, i),
staleness_get_snap_bytes(snap->stale, i),
staleness_get_snap_last_access(snap->stale, i));
}
FLUSH_BUFFER(f_staleness, snaps_log_buf, sofar);
}
snapshot_count++;
}
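/* Illustrative snapshot.log excerpt from dump_snapshot() (values are made up
 * and field widths approximate). After the header lines, each line is
 * "callstack id,instances,bytes asked for,extra usable,extra occupied":
 *   SNAPSHOT #  12 @            34560 mallocs
 *   idx=5, stamp_offs=               0
 *   total: 120,40960,43008,43968
 *   3,2,1024,16,8
 *   17,1,64,0,4
 */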
/* Caller must hold snapshot_lock */
static void
free_snapshot(per_snapshot_t *snap)
{
heap_used_t *u, *nxt_u;
for (u = snap->used; u != NULL; u = nxt_u) {
nxt_u = u->next;
global_free(u, sizeof(*u), HEAPSTAT_SNAPSHOT);
}
snap->used = NULL;
if (options.staleness && snap->stale != NULL) {
staleness_free_snapshot(snap->stale);
snap->stale = NULL;
}
}
/* Caller must hold snapshot_lock.
* Calls free_snapshot on dst first.
* If new_live is true, dst is the new "live" in-progress snapshot
* whose list is where callstack table entries point.
*/
static void
copy_snapshot(per_snapshot_t *dst, per_snapshot_t *src, bool new_live)
{
heap_used_t *u, *nxt_u, *prev_u;
int i;
/* Replace the existing list at dst with a clone of
* src, and update the callstack table pointers. First
* we clear the callstack table pointers in case pointing to
* entries not in the new list.
*/
ASSERT(src != dst, "cannot copy to self");
free_snapshot(dst);
memcpy(dst, src, sizeof(*dst));
if (options.staleness) {
/* We fill this in at snapshot time */
dst->stale = NULL;
}
hashtable_lock(&alloc_stack_table);
if (new_live) {
for (i = 0; i < HASHTABLE_SIZE(alloc_stack_table.table_bits); i++) {
hash_entry_t *he;
for (he = alloc_stack_table.table[i]; he != NULL; he = he->next) {
per_callstack_t *per = (per_callstack_t *) he->payload;
per->used = NULL;
per->prev_used = NULL;
}
}
}
prev_u = NULL;
for (u = src->used; u != NULL; u = u->next) {
nxt_u = (heap_used_t *) global_alloc(sizeof(*nxt_u), HEAPSTAT_SNAPSHOT);
memcpy(nxt_u, u, sizeof(*nxt_u));
if (prev_u == NULL)
dst->used = nxt_u;
else
prev_u->next = nxt_u;
if (new_live) {
nxt_u->callstack->used = nxt_u;
nxt_u->callstack->prev_used = prev_u;
}
nxt_u->next = NULL;
prev_u = nxt_u;
}
hashtable_unlock(&alloc_stack_table);
}
static bool
difference_exceeds_percent(uint64 new_val, uint64 old_val, uint percent)
{
/* Avoid floating-point via 100*. Assuming no overflow b/c uint64. */
uint64 diff = (new_val > old_val) ? (new_val - old_val) : (old_val - new_val);
return (100 * diff > percent * old_val);
}
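/* Example: difference_exceeds_percent(120, 100, 15) returns true because
 * 100*|120-100| = 2000 > 15*100 = 1500, i.e., a 20% change exceeds a 15%
 * threshold without any floating-point math.
 */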
/* If the current snap_idx snapshot is larger than the current peak,
* makes a new peak snapshot (PR 476018).
* Assumes snapshot lock and malloc_lock are held.
*/
static void
check_for_peak(void)
{
if (snaps[snap_idx].tot_bytes_occupied > snap_peak.tot_bytes_occupied) {
/* PR 566116: avoid too-frequent peak snapshots by ignoring if the new
* peak is similar to the existing one, both in size and in malloc
* makeup. May need to split -peak_threshold into 3 if we need
* separate control of each variable.
*/
if (difference_exceeds_percent(snaps[snap_idx].tot_bytes_occupied,
snap_peak.tot_bytes_occupied,
options.peak_threshold) ||
difference_exceeds_percent(allocfree_cur, allocfree_last_peak,
                                       options.peak_threshold) ||
/* even if not much different, if it's been a long time, use it */
difference_exceeds_percent(snaps[snap_idx].stamp,
snap_peak.stamp,
options.peak_threshold)) {
STATS_INC(peaks_detected);
allocfree_last_peak = allocfree_cur;
copy_snapshot(&snap_peak, &snaps[snap_idx], false/*isolated copy*/);
if (options.staleness) {
/* copy_snapshot called free_snapshot which freed this */
ASSERT(snap_peak.stale == NULL, "invalid staleness data");
snap_peak.stale = staleness_take_snapshot(stamp);
}
LOG(2, "new peak snapshot, tot occupied=%"INT64_FORMAT"u\n",
snap_peak.tot_bytes_occupied);
} else {
STATS_INC(peaks_skipped);
LOG(2, "NOT taking new peak snapshot, tot occupied=%"INT64_FORMAT"u\n",
snap_peak.tot_bytes_occupied);
}
}
}
/* Caller must hold malloc_lock() */
static void
take_snapshot(void)
{
uint prev_idx;
    /* We serialize snapshots since they are rare, so serializing is not costly
     * perf-wise, and it avoids needing potentially very large buffers to try to
     * get atomic writes.
*/
dr_mutex_lock(snapshot_lock);
if (options.staleness) {
/* Unlike the mem usage data which is maintained as the app
* executes, we have to go collect this at snapshot time from
* the malloc table.
*/
if (options.dump && snaps[snap_idx].stale != NULL) {
staleness_free_snapshot(snaps[snap_idx].stale);
snaps[snap_idx].stale = NULL;
}
ASSERT(snaps[snap_idx].stale == NULL, "invalid staleness data");
snaps[snap_idx].stale = staleness_take_snapshot(stamp);
}
if (options.dump) {
snaps[snap_idx].stamp += options.dump_freq;
dump_snapshot(&snaps[snap_idx], snap_idx);
} else {
stamp += options.dump_freq;
snaps[snap_idx].stamp = stamp;
prev_idx = snap_idx;
LOG(2, "take_snapshot @idx=%u stamp=%"INT64_FORMAT"u\n", prev_idx, stamp);
/* Check for peak on every snapshot (PR 476018) */
check_for_peak();
/* Find the next one we should overwrite. Keep those aligned
* w/ current dump_freq.
*/
do {
snap_idx++;
if (snap_idx >= options.snapshots) {
snap_fills++;
snap_idx = 0;
options.dump_freq *= 2;
LOG(1, "adjusting snapshots: new freq=%u\n", options.dump_freq);
if (options.time_clock)
reset_clock_timer();
}
} while (snaps[snap_idx].stamp > 0 &&
(snaps[snap_idx].stamp % options.dump_freq) == 0);
/* Replace the existing list at snap_idx with a clone of prev_idx */
copy_snapshot(&snaps[snap_idx], &snaps[prev_idx], true/*live copy*/);
}
dr_mutex_unlock(snapshot_lock);
}
/* Called from pre-alloc-hashtable-change events.
* Updates the current snapshot and callstack usage.
*/
static void
account_for_bytes_pre(per_callstack_t *per, int asked_for,
int extra_usable, int extra_occupied, bool realloc)
{
/* must be synched w/ take_snapshot(). the malloc lock is always acquired
* before the snapshot lock.
*/
dr_mutex_lock(snapshot_lock);
if (asked_for+extra_usable > 0) {
if (per->used == NULL) {
per->used = (heap_used_t *)
global_alloc(sizeof(*per->used), HEAPSTAT_SNAPSHOT);
memset(per->used, 0, sizeof(*per->used));
per->used->callstack = per;
per->used->next = snaps[snap_idx].used;
if (snaps[snap_idx].used != NULL) {
ASSERT(snaps[snap_idx].used->callstack->prev_used == NULL,
"prev_used should already be null");
snaps[snap_idx].used->callstack->prev_used = per->used;
}
ASSERT(per->prev_used == NULL, "prev_used should already be null");
snaps[snap_idx].used = per->used;
}
if (!realloc)
per->used->instances++;
snaps[snap_idx].tot_mallocs++;
} else {
ASSERT(asked_for+extra_usable < 0, "cannot have 0-sized usable space");
ASSERT(per->used != NULL, "alloc must exist");
ASSERT(per->used->instances >= 0, "alloc count must be >= 0");
ASSERT(snaps[snap_idx].tot_mallocs >= 0, "alloc count must be >= 0");
if (!realloc)
per->used->instances--;
snaps[snap_idx].tot_mallocs--;
}
per->used->bytes_asked_for += asked_for;
per->used->extra_usable += extra_usable;
per->used->extra_occupied += extra_occupied;
LOG(2, "callstack id %u => %ux, %uB, +%uB, +%uB\n", per->id,
per->used->instances, per->used->bytes_asked_for,
per->used->extra_usable, per->used->extra_occupied);
if (per->used->instances == 0) {
/* remove the node to save memory since may not re-alloc */
ASSERT(per->used->bytes_asked_for == 0, "no malloc => no bytes!");
if (per->used->next != NULL)
per->used->next->callstack->prev_used = per->prev_used;
if (per->prev_used == NULL) {
ASSERT(per->used == snaps[snap_idx].used, "prev node error");
snaps[snap_idx].used = per->used->next;
} else {
per->prev_used->next = per->used->next;
}
global_free(per->used, sizeof(*per->used), HEAPSTAT_SNAPSHOT);
per->used = NULL;
per->prev_used = NULL;
}
snaps[snap_idx].tot_bytes_asked_for += asked_for;
snaps[snap_idx].tot_bytes_usable += asked_for + extra_usable;
snaps[snap_idx].tot_bytes_occupied += asked_for + extra_usable + extra_occupied;
dr_mutex_unlock(snapshot_lock);
}
/* Called from post-alloc-hashtable-change events, which is important for the
 * staleness hashtable walk to see a consistent view (PR 567117).
* Updates the -time_allocs and -time_bytes counters and potentially
* takes snapshots.
*/
static void
account_for_bytes_post(int asked_for, int extra_usable, int extra_occupied)
{
if (options.time_bytes) {
/* PR 545288: consider dealloc as well as alloc */
int diff = asked_for + extra_usable + extra_occupied;
if (diff < 0)
diff = -diff;
if (diff > byte_count) {
/* allocs larger than cur freq occupy multiple snapshots */
while (diff > byte_count) {
take_snapshot();
diff -= options.dump_freq;
}
byte_count = options.dump_freq;
} else {
byte_count -= diff;
}
} else if (options.time_allocs) {
/* PR 545288: consider free as well as alloc */
/* We rely on malloc lock being held by caller */
allocfree_count--;
if (allocfree_count <= 0) {
take_snapshot();
allocfree_count = options.dump_freq;
}
}
allocfree_cur++;
}
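/* Worked example for -time_bytes (hypothetical numbers): with dump_freq and
 * byte_count both at 1024, an allocation or free whose
 * |asked_for + extra_usable + extra_occupied| is 3000 triggers two calls to
 * take_snapshot() in the loop above (3000 > 1024, then 1976 > 1024) before
 * byte_count is reset to the (possibly doubled) dump_freq.
 */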
static void
dump_callstack(packed_callstack_t *pcs, per_callstack_t *per,
char *buf, size_t bufsz, size_t *sofar)
{
ssize_t len = 0;
/* we use a buffer for atomic prints even though malloc lock does
* currently synchronize
*/
BUFPRINT(buf, bufsz, *sofar, len, "CALLSTACK %u\n", per->id);
packed_callstack_print(pcs, 0, buf, bufsz, sofar, NULL);
BUFPRINT(buf, bufsz, *sofar, len, "%s", END_MARKER);
print_buffer(f_callstack, buf);
}
/* A lock is held around the call to this routine */
void *
client_add_malloc_pre(malloc_info_t *info, dr_mcontext_t *mc, app_pc post_call)
{
void *drcontext = dr_get_current_drcontext();
per_callstack_t *per;
char *buf;
size_t bufsz;
size_t sofar = 0;
#ifdef STATISTICS
static uint malloc_count;
#endif
get_buffer(drcontext, &buf, &bufsz);
if (info->client_data != NULL) {
per = get_cstack_from_alloc_data(info->client_data);
IF_DEBUG({
hashtable_lock(&alloc_stack_table);
ASSERT(hashtable_lookup(&alloc_stack_table,
(void *)per->IF_MD5_ELSE(md5, crc)) == (void*)per,
"malloc re-add should still be in table");
hashtable_unlock(&alloc_stack_table);
});
} else {
#if defined(USE_MD5) || defined(CHECK_WITH_MD5)
byte md5[MD5_RAW_BYTES];
#endif
#ifndef USE_MD5
uint crc[2];
#endif
/* Printing to a buffer is slow (quite noticeable: 2x on cfrac) so it's
* faster to create a packed callstack for computing the checksum to
* decide uniqueness, limiting printing to new callstacks only.
*/
packed_callstack_t *pcs;
app_loc_t loc;
pc_to_loc(&loc, post_call);
packed_callstack_record(&pcs, mc, &loc, options.callstack_max_frames);
#if defined(USE_MD5) || defined(CHECK_WITH_MD5)
packed_callstack_md5(pcs, md5);
#endif
#ifndef USE_MD5
packed_callstack_crc32(pcs, crc);
#endif
hashtable_lock(&alloc_stack_table);
per = (per_callstack_t *)
hashtable_lookup(&alloc_stack_table, (void *)IF_MD5_ELSE(md5, crc));
#ifdef CHECK_WITH_MD5
/* Check for collisions with crc32 */
ASSERT(per == hashtable_lookup(&alloc_md5_table, (void *)md5),
"crc and md5 do not agree");
#endif
if (per == NULL) {
per = (per_callstack_t *) global_alloc(sizeof(*per), HEAPSTAT_CALLSTACK);
memset(per, 0, sizeof(*per));
/* we could do ++ since there's an outer lock */
per->id = atomic_add32_return_sum((volatile int *)&num_callstacks, 1);
#if defined(USE_MD5) || defined(CHECK_WITH_MD5)
memcpy(per->md5, md5, BUFFER_SIZE_BYTES(md5));
hashtable_add(IF_MD5_ELSE(&alloc_stack_table, &alloc_md5_table),
(void *)per->md5, (void *)per);
#endif
#ifndef USE_MD5
per->crc[0] = crc[0];
per->crc[1] = crc[1];
hashtable_add(&alloc_stack_table, (void *)per->crc, (void *)per);
#endif
STATS_INC(alloc_stack_count);
dump_callstack(pcs, per, buf, bufsz, &sofar);
}
hashtable_unlock(&alloc_stack_table);
sofar = packed_callstack_free(pcs);
ASSERT(sofar == 0, "pcs should have 0 ref count");
}
#ifdef X64
/* FIXME: assert not truncating */
#endif
account_for_bytes_pre(per, info->request_size, info->pad_size, HEADER_SIZE, false);
#ifdef STATISTICS
if (((malloc_count++) % STATS_DUMP_FREQ) == 0)
dump_statistics();
#endif
release_buffer(drcontext, buf, bufsz);
if (options.staleness)
return (void *) staleness_create_per_alloc(per, stamp);
else
return (void *) per;
}
void
client_add_malloc_post(malloc_info_t *info)
{
/* take potential snapshots here once table is consistent (PR 567117) */
account_for_bytes_post(info->request_size, info->pad_size, HEADER_SIZE);
}
/* A lock is held around the call to this routine */
void
client_remove_malloc_pre(malloc_info_t *info)
{
per_callstack_t *per = get_cstack_from_alloc_data(info->client_data);
#ifdef X64
/* FIXME: assert not truncating */
#endif
/* To avoid repeatedly redoing the peak snapshot we wait until a drop (PR 476018) */
dr_mutex_lock(snapshot_lock);
check_for_peak();
dr_mutex_unlock(snapshot_lock);
account_for_bytes_pre(per, -(ssize_t)info->request_size, -(ssize_t)info->pad_size,
-(ssize_t)(HEADER_SIZE), false);
if (options.staleness)
staleness_free_per_alloc((stale_per_alloc_t *)info->client_data);
}
void
client_remove_malloc_post(malloc_info_t *info)
{
/* take potential snapshots here once table is consistent (PR 567117) */
account_for_bytes_post(info->request_size, info->pad_size, HEADER_SIZE);
}
static void
snapshot_init(void)
{
snapshot_lock = dr_mutex_create();
snaps = (per_snapshot_t *)
global_alloc(options.snapshots*sizeof(*snaps), HEAPSTAT_SNAPSHOT);
memset(snaps, 0, options.snapshots*sizeof(*snaps));
}
/* Caller must hold malloc lock */
static void
snapshot_dump_all(void)
{
uint i;
dr_mutex_lock(snapshot_lock);
/* These should be sorted by stamp, but simpler to have the vis tool
* sort them.
*/
/* We do dump the partially-full current snapshot (PR 548013) */
if (options.time_clock) {
uint64 diff = ((dr_get_milliseconds() - timestamp_last_snapshot)
/ TIME_BASE_FREQ) + 1 /* round up */;
snaps[snap_idx].stamp = stamp + diff;
} else if (options.time_allocs)
snaps[snap_idx].stamp = stamp + (options.dump_freq - allocfree_count);
else if (options.time_bytes)
snaps[snap_idx].stamp = stamp + (options.dump_freq - byte_count);
else if (options.time_instrs)
snaps[snap_idx].stamp = stamp + (options.dump_freq - instr_count);
/* Check for peak on every snapshot (PR 476018) */
check_for_peak();
dump_snapshot(&snap_peak, -1);
if (snap_fills == 0) {
for (i = 0; i <= snap_idx; i++) {
dump_snapshot(&snaps[i], i);
}
} else {
/* FIXME: sort by stamp */
for (i = 0; i < options.snapshots; i++) {
dump_snapshot(&snaps[i], i);
}
}
dr_mutex_unlock(snapshot_lock);
}
static void
snapshot_exit(void)
{
int i;
snapshot_dump_all();
for (i = 0; i < options.snapshots; i++)
free_snapshot(&snaps[i]);
global_free(snaps, options.snapshots*sizeof(*snaps), HEAPSTAT_SNAPSHOT);
free_snapshot(&snap_peak);
dr_mutex_destroy(snapshot_lock);
}
void
client_handle_malloc(void *drcontext, malloc_info_t *info, dr_mcontext_t *mc)
{
if (options.check_leaks)
leak_handle_alloc(drcontext, info->base, info->request_size);
}
void
client_handle_realloc(void *drcontext, malloc_info_t *old_info, malloc_info_t *new_info,
bool for_reuse, dr_mcontext_t *mc)
{
/* We only need to act on an in-place realloc, as an out-of-place realloc
* will have already called client_{remove,add}_malloc_{pre,post}.
*/
if (new_info->base == old_info->base) {
per_callstack_t *per = get_cstack_from_alloc_data(new_info->client_data);
ssize_t delta_req = (ssize_t)new_info->request_size - old_info->request_size;
ssize_t delta_pad = (ssize_t)new_info->pad_size - old_info->pad_size;
ssize_t delta_head = 0;
if (delta_req < 0) {
/* Just like on a free we check for a drop in the peak */
dr_mutex_lock(snapshot_lock);
check_for_peak();
dr_mutex_unlock(snapshot_lock);
}
account_for_bytes_pre(per, delta_req, delta_pad, delta_head, true/*realloc*/);
account_for_bytes_post(delta_req, delta_pad, 0);
}
if (options.check_leaks)
leak_handle_alloc(drcontext, new_info->base, new_info->request_size);
}
void
client_handle_alloc_failure(size_t request_size, app_pc pc, dr_mcontext_t *mc)
{
}
void
client_handle_realloc_null(app_pc pc, dr_mcontext_t *mc)
{
}
/* Returns the value to pass to free(). Return "real_base" for no change.
* The Windows heap param is INOUT so it can be changed as well.
*/
app_pc
client_handle_free(malloc_info_t *info, byte *tofree, dr_mcontext_t *mc,
app_pc free_routine, void *routine_set_data, bool for_reuse
_IF_WINDOWS(ptr_int_t *auxarg INOUT))
{
return tofree;
}
void
client_handle_free_reuse(void *drcontext, malloc_info_t *info, dr_mcontext_t *mc)
{
/* nothing */
}
void
client_new_redzone(app_pc start, size_t size)
{
/* nothing */
}
void
client_invalid_heap_arg(app_pc pc, app_pc target, dr_mcontext_t *mc, const char *routine,
bool is_free)
{
/* not reporting today */
LOG(1, "invalid heap arg\n");
}
void
client_mismatched_heap(app_pc pc, app_pc target, dr_mcontext_t *mc,
const char *alloc_routine, const char *free_routine,
const char *action, void *client_data, bool C_vs_CPP)
{
/* not reporting today */
LOG(1, "mismatched heap routines\n");
}
void
client_handle_mmap(void *drcontext, app_pc base, size_t size, bool anon)
{
}
void
client_handle_munmap(app_pc base, size_t size, bool anon)
{
}
void
client_handle_munmap_fail(app_pc base, size_t size, bool anon)
{
}
#ifdef UNIX
void
client_handle_mremap(app_pc old_base, size_t old_size, app_pc new_base, size_t new_size,
bool image)
{
}
#endif
void *
client_add_malloc_routine(app_pc pc)
{
return NULL;
}
void
client_remove_malloc_routine(void *client_data)
{
}
#ifdef WINDOWS
void
client_handle_heap_destroy(void *drcontext, HANDLE heap,
void *client_data)
{
}
void
client_remove_malloc_on_destroy(HANDLE heap, byte *start, byte *end)
{
if (options.check_leaks)
leak_remove_malloc_on_destroy(heap, start, end);
}
#endif /* WINDOWS */
void
client_stack_alloc(byte *start, byte *end, bool defined)
{
}
void
client_stack_dealloc(byte *start, byte *end)
{
}
bool
client_write_memory(byte *start, size_t size, dr_mcontext_t *mc)
{
return true;
}
bool
client_read_memory(byte *start, size_t size, dr_mcontext_t *mc)
{
return true;
}
#ifdef DEBUG
void
client_print_callstack(void *drcontext, dr_mcontext_t *mc, app_pc pc)
{
print_callstack_to_file(drcontext, mc, pc, f_global, options.callstack_max_frames);
}
#endif
void
client_pre_syscall(void *drcontext, int sysnum)
{
}
void
client_post_syscall(void *drcontext, int sysnum)
{
}
void
client_found_leak(app_pc start, app_pc end, size_t indirect_bytes,
bool pre_us, bool reachable,
bool maybe_reachable, void *client_data,
bool count_reachable, bool show_reachable)
{
/* XXX i#926: we now have online symbolization of leak callstacks, but
* we're re-symbolizing in postleaks.pl until we can produce a proper
* error report here including suppression checking. We should refactor
* Dr. Memory leak reporting (and checking) into a library for sharing.
*/
per_callstack_t *per = get_cstack_from_alloc_data(client_data);
ssize_t len = 0;
size_t sofar = 0;
char *buf;
size_t bufsz;
void *drcontext = dr_get_current_drcontext();
int num;
ASSERT(options.check_leaks, "leak checking error");
if (pre_us && options.ignore_early_leaks)
return;
if (reachable) {
if (count_reachable)
ATOMIC_INC32(reachable_leak_count);
if (!show_reachable)
return;
}
if (maybe_reachable && !options.possible_leaks)
return;
num = atomic_add32_return_sum((volatile int *)&leak_count, 1);
get_buffer(drcontext, &buf, &bufsz);
BUFPRINT(buf, bufsz, sofar, len, "Error #%d: ", num);
if (reachable)
BUFPRINT(buf, bufsz, sofar, len, "REACHABLE ");
else if (maybe_reachable)
BUFPRINT(buf, bufsz, sofar, len, "POSSIBLE ");
BUFPRINT(buf, bufsz, sofar, len,
"LEAK %d direct bytes "PFX"-"PFX" + %d indirect bytes"
"\n\tcallstack=%d\n\terror end\n",
(end - start), start, end, indirect_bytes, per->id);
print_buffer(f_global, buf);
release_buffer(drcontext, buf, bufsz);
}
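/* Illustrative leak report written to the global log by the BUFPRINT above
 * (addresses, sizes, and callstack id are made up; the last two fields are
 * tab-indented in the actual output):
 *   Error #3: LEAK 24 direct bytes 0x08a41020-0x08a41038 + 0 indirect bytes
 *       callstack=17
 *       error end
 */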
/***************************************************************************
* INSTRUMENTATION
*/
/* N.B.: mcontext is not in consistent app state, for efficiency.
*/
static void
shared_instrcnt_callee(void)
{
bool do_snapshot = false;
ASSERT(options.time_instrs, "option mismatch");
/* We racily subtract and check, so serialize now for a single snapshot */
dr_mutex_lock(snapshot_lock);
    /* We can still take a double snapshot if the threshold is low enough that
     * other threads bump instr_count over it before the 2nd thread to get the
     * lock can do this check, but that's ok.
     */
if (instr_count < 0) {
do_snapshot = true;
instr_count = options.dump_freq;
} else {
/* We assume can't take so long that it wraps around */
ASSERT(instr_count <= options.dump_freq, "callee incorrectly invoked");
}
/* Release lock so take_snapshot can grab it */
dr_mutex_unlock(snapshot_lock);
if (do_snapshot)
take_snapshot();
}
/* To avoid the code expansion from a clean call in every bb we use
* a shared clean call sequence, entering it with a direct jump
* and exiting with an indirect jump to a stored return point.
*/
static app_pc
generate_shared_callout(void *drcontext, instrlist_t *ilist, app_pc pc)
{
/* On entry:
* - SPILL_SLOT_2 holds the return address
* The spill slots are persistent storage across clean calls.
*/
dr_insert_clean_call(drcontext, ilist, NULL,
(void *) shared_instrcnt_callee, false, 0);
PRE(ilist, NULL,
INSTR_CREATE_jmp_ind(drcontext,
dr_reg_spill_slot_opnd(drcontext, SPILL_SLOT_2)));
shared_instrcnt_callout = pc;
pc = instrlist_encode(drcontext, ilist, pc, false);
instrlist_clear(drcontext, ilist);
return pc;
}
static void
create_shared_code(void)
{
void *drcontext = dr_get_current_drcontext();
byte *pc;
IF_DEBUG(bool ok;)
instrlist_t *ilist = instrlist_create(drcontext);
shared_code_region = (byte *)
nonheap_alloc(SHARED_CODE_SIZE,
DR_MEMPROT_READ|DR_MEMPROT_WRITE|DR_MEMPROT_EXEC,
HEAPSTAT_GENCODE);
pc = shared_code_region;
pc = generate_shared_callout(drcontext, ilist, pc);
ASSERT(pc - shared_code_region <= SHARED_CODE_SIZE, "shared code region too small");
if (options.staleness) {
pc = generate_shared_slowpath(drcontext, ilist, pc);
ASSERT(pc - shared_code_region <= SHARED_CODE_SIZE,
"shared code region too small");
}
if (options.check_leaks) {
pc = generate_shared_esp_slowpath(drcontext, ilist, pc);
ASSERT(pc - shared_code_region <= SHARED_CODE_SIZE,
"shared code region too small");
}
instrlist_clear_and_destroy(drcontext, ilist);
/* now mark as +rx (non-writable) */
IF_DEBUG(ok = )
dr_memory_protect(shared_code_region, SHARED_CODE_SIZE,
DR_MEMPROT_READ|DR_MEMPROT_EXEC);
ASSERT(ok, "-w failed on shared routines gencode");
DOLOG(2, {
byte *end_pc = pc;
pc = shared_code_region;
LOG(2, "shared_code region:\n");
while (pc < end_pc) {
pc = disassemble_with_info(drcontext, pc, f_global,
true/*show pc*/, true/*show bytes*/);
}
});
}
static void
free_shared_code(void)
{
nonheap_free(shared_code_region, SHARED_CODE_SIZE, HEAPSTAT_GENCODE);
}
static void
insert_instr_counter(void *drcontext, instrlist_t *bb,
instr_t *first, bool flags_dead, instr_t *where_dead,
uint instrs_in_bb)
{
instr_t *where = (where_dead == NULL) ? first : where_dead;
instr_t *done = INSTR_CREATE_label(drcontext);
if (!flags_dead)
dr_save_arith_flags(drcontext, bb, first, SPILL_SLOT_1);
/* Rather than an ongoing count that would need a 64-bit
* counter, we do a racy subtract of a 32-bit counter and if
* negative (so we don't need a cmp) then we go to a callout
* that synchs for a counter reset and single snapshot.
* We assume that racy mods during that synch won't overflow
* the counter. We also ignore the detail of how many instrs
* in this bb we've executed yet.
*/
instrlist_meta_preinsert
(bb, where, INSTR_CREATE_sub(drcontext, OPND_CREATE_ABSMEM
((byte *)&instr_count, OPSZ_4),
(instrs_in_bb <= CHAR_MAX) ?
OPND_CREATE_INT8(instrs_in_bb) :
OPND_CREATE_INT32(instrs_in_bb)));
/* TODO: for better perf could not bother to check threshold
* in every bb: but don't want to skip for single-bb-loop-body
*/
instrlist_meta_preinsert
(bb, where, INSTR_CREATE_jcc(drcontext, OP_jns_short,
opnd_create_instr(done)));
/* To avoid the code expansion from a clean call in every bb we use
* a shared clean call sequence, entering it with a direct jump
* and exiting with an indirect jump to a stored return point.
*/
/* Get return point into SPILL_SLOT_2. Spill reg, mov imm to reg, and then
* xchg reg and slot isn't any faster, right?
*/
instrlist_meta_preinsert
(bb, where, INSTR_CREATE_mov_st
(drcontext, dr_reg_spill_slot_opnd(drcontext, SPILL_SLOT_2),
opnd_create_instr(done)));
instrlist_meta_preinsert
(bb, where, INSTR_CREATE_jmp(drcontext,
opnd_create_pc(shared_instrcnt_callout)));
/* Will return here */
instrlist_meta_preinsert(bb, where, done);
if (!flags_dead)
dr_restore_arith_flags(drcontext, bb, first, SPILL_SLOT_1);
}
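/* Rough sketch of the per-bb sequence this emits (x86, flags-dead case):
 *       sub    dword ptr [instr_count], <instrs_in_bb>
 *       jns    done                       ; common case: threshold not reached
 *       mov    SPILL_SLOT_2, <addr of done>
 *       jmp    shared_instrcnt_callout    ; shared clean call, returns via slot
 *   done:
 * When the flags are live, the counting code is bracketed by
 * dr_save_arith_flags()/dr_restore_arith_flags() using SPILL_SLOT_1.
 */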
static dr_emit_flags_t
event_bb_app2app(void *drcontext, void *tag, instrlist_t *bb,
bool for_trace, bool translating, void **user_data OUT)
{
/* we pass bi among all 4 phases */
instru_info_t *ii = thread_alloc(drcontext, sizeof(*ii), HEAPSTAT_PERBB);
memset(ii, 0, sizeof(*ii));
*user_data = (void *) ii;
return DR_EMIT_DEFAULT;
}
static dr_emit_flags_t
event_bb_analysis(void *drcontext, void *tag, instrlist_t *bb,
bool for_trace, bool translating, void *user_data)
{
instru_info_t *ii = (instru_info_t *) user_data;
DOLOG(3, instrlist_disassemble(drcontext, tag, bb, LOGFILE_GET(drcontext)););
#ifdef USE_DRSYMS
DOLOG(3, {
char buf[128];
size_t sofar = 0;
ssize_t len;
if (!translating) {
BUFPRINT(buf, BUFFER_SIZE_ELEMENTS(buf), sofar, len,
"new basic block @"PFX" ==", tag);
print_symbol(tag, buf, BUFFER_SIZE_ELEMENTS(buf), &sofar, false, 0);
LOG(1, "%s\n", buf);
}
});
#endif
if (options.staleness)
fastpath_top_of_bb(drcontext, tag, bb, &ii->bi);
return DR_EMIT_DEFAULT;
}
static dr_emit_flags_t
event_bb_insert(void *drcontext, void *tag, instrlist_t *bb, instr_t *inst,
bool for_trace, bool translating, void *user_data)
{
instru_info_t *ii = (instru_info_t *) user_data;
uint flags;
fastpath_info_t mi;
if (instr_is_meta(inst))
return DR_EMIT_DEFAULT;
/* i#2402: Temporarily disable auto predication globally due to poor
* interaction with internal control flow we emit.
*/
drmgr_disable_auto_predication(drcontext, bb);
ii->instrs_in_bb++;
if (options.time_instrs) {
/* See if flags are dead anywhere.
* FIXME: for fault handling we should either consider a faultable
* instr as having live flags or recover somehow: ignoring for now
* as pathological.
*/
flags = instr_get_arith_flags(inst, DR_QUERY_DEFAULT);
/* We insert after the prev instr to avoid messing up
* -check_leaks instrumentation (PR 560871)
*/
if (TESTALL(EFLAGS_WRITE_6, flags) && !TESTANY(EFLAGS_READ_6, flags)) {
ii->where_dead = instr_get_prev(inst);
ii->flags_dead = true;
}
}
if (options.staleness) {
/* We want to spill AFTER any clean call in case it changes mcontext */
ii->bi.spill_after = instr_get_prev(inst);
/* update liveness of whole-bb spilled regs */
fastpath_pre_instrument(drcontext, bb, inst, &ii->bi);
if (instr_uses_memory_we_track(inst)) {
if (instr_ok_for_instrument_fastpath(inst, &mi, &ii->bi)) {
instrument_fastpath(drcontext, bb, inst, &mi, false);
ii->bi.added_instru = true;
} else {
LOG(3, "fastpath unavailable "PFX": ", instr_get_app_pc(inst));
DOLOG(3, { instr_disassemble(drcontext, inst,
LOGFILE_GET(drcontext)); });
LOG(3, "\n");
ii->bi.shared_memop = opnd_create_null();
/* Restore whole-bb spilled regs (PR 489221)
* FIXME: optimize via liveness analysis
*/
mi.reg1 = ii->bi.reg1;
mi.reg2 = ii->bi.reg2;
memset(&mi.reg3, 0, sizeof(mi.reg3));
instrument_slowpath(drcontext, bb, inst,
whole_bb_spills_enabled() ? &mi : NULL);
/* for whole-bb slowpath does interact w/ global regs */
ii->bi.added_instru = whole_bb_spills_enabled();
}
}
}
if (ZERO_STACK() && instr_writes_esp(inst)) {
/* any new spill must be after the alloc instru */
ii->bi.spill_after = instr_get_prev(inst);
/* we zero for leaks, and staleness does not care about xsp */
instrument_esp_adjust(drcontext, bb, inst, &ii->bi,
SP_ADJUST_ACTION_ZERO);
ii->bi.added_instru = true;
}
if (options.staleness)
fastpath_pre_app_instr(drcontext, bb, inst, &ii->bi, &mi);
return DR_EMIT_DEFAULT;
}
static dr_emit_flags_t
event_bb_instru2instru(void *drcontext, void *tag, instrlist_t *bb,
bool for_trace, bool translating, void *user_data)
{
instru_info_t *ii = (instru_info_t *) user_data;
if (options.staleness) {
fastpath_bottom_of_bb(drcontext, tag, bb, &ii->bi, ii->bi.added_instru,
translating, false);
}
if (options.time_instrs) {
insert_instr_counter(drcontext, bb, instrlist_first(bb), ii->flags_dead,
/* insert after the prev instr to avoid messing up
* -check_leaks instrumentation (PR 560871)
*/
(ii->where_dead == NULL) ?
instrlist_first(bb) : instr_get_next(ii->where_dead),
ii->instrs_in_bb);
}
LOG(3, "final instrumentation:\n");
DOLOG(3, instrlist_disassemble(drcontext, tag, bb, LOGFILE_GET(drcontext)););
thread_free(drcontext, ii, sizeof(*ii), HEAPSTAT_PERBB);
return DR_EMIT_DEFAULT; /* deterministic */
}
/***************************************************************************
* DYNAMORIO EVENTS & TOP-LEVEL CODE
*/
#ifdef STATISTICS
/* statistics
* FIXME: make per-thread to avoid races (or use locked inc)
* may want some of these to be 64-bit
*/
static void
dump_statistics(void)
{
int i;
dr_fprintf(f_global, "Statistics:\n");
dr_fprintf(f_global, "app mallocs: %8u, frees: %8u, large mallocs; %6u\n",
num_mallocs, num_frees, num_large_mallocs);
dr_fprintf(f_global, "unique malloc stacks: %8u\n", alloc_stack_count);
dr_fprintf(f_global, "app heap regions: %8u\n", heap_regions);
dr_fprintf(f_global, "peaks detected: %8u, skipped: %8u\n",
peaks_detected, peaks_skipped);
if (options.staleness) {
dr_fprintf(f_global, "staleness: needs large: %7u, needs ext: %7u\n",
stale_needs_large, stale_small_needs_ext);
}
/* FIXME: share w/ drmemory.c */
dr_fprintf(f_global, "\nPer-opcode slow path executions:\n");
for (i = 0; i <= OP_LAST; i++) {
if (slowpath_count[i] > 0) {
dr_fprintf(f_global, "\t%3u %10s: %12"UINT64_FORMAT_CODE"\n",
i, decode_opcode_name(i), slowpath_count[i]);
}
}
heap_dump_stats(f_global);
}
#endif /* STATISTICS */
#ifdef DEBUG
/* drsyscall debug build uses this */
void
report_callstack(void *drcontext, dr_mcontext_t *mc)
{
}
#endif /* DEBUG */
static void
client_heap_add(app_pc start, app_pc end, dr_mcontext_t *mc)
{
LOG(2, "%s "PFX"-"PFX"\n", __FUNCTION__, start, end);
if (options.staleness)
shadow_create_shadow_memory(start, end, 0);
}
static void
client_heap_remove(app_pc start, app_pc end, dr_mcontext_t *mc)
{
/* save memory and improve performance by putting back the specials --
* except w/ our sideline thread doing sweeps we could have races
* (unlikely since this is a heap region remove) so it's not safe.
* we just live w/ the extra mem and writes.
*/
LOG(2, "%s "PFX"-"PFX"\n", __FUNCTION__, start, end);
#if 0 /* disabled: see comment above */
if (options.staleness)
shadow_reinstate_specials_in_range(start, end);
#endif
}
static void
heap_iter_region(app_pc start, app_pc end _IF_WINDOWS(HANDLE heap))
{
client_heap_add(start, end, NULL);
heap_region_add(start, end, HEAP_PRE_US | HEAP_ARENA, NULL);
}
static void
heap_iter_chunk(app_pc start, app_pc end)
{
/* We don't have the asked-for size so we use real end for both */
malloc_add(start, end, end, true/*pre_us*/, 0, NULL, NULL);
}
/* Walk the heap blocks that are already allocated at client init time */
static void
heap_walk(void)
{
heap_iterator(heap_iter_region, heap_iter_chunk _IF_WINDOWS(NULL));
}
/* if which_thread is >= 0, creates a file name with which_thread and
* the cur thread's tid
*/
static file_t
open_logfile(const char *name, bool pid_log, int which_thread)
{
file_t f;
char logname[MAXIMUM_PATH];
IF_DEBUG(int len;)
uint extra_flags = IF_UNIX_ELSE(DR_FILE_ALLOW_LARGE, 0);
ASSERT(logsubdir[0] != '\0', "logsubdir not set up");
if (pid_log) {
IF_DEBUG(len = )
dr_snprintf(logname, BUFFER_SIZE_ELEMENTS(logname),
"%s%c%s.%d.log", logsubdir, DIRSEP, name, dr_get_process_id());
} else if (which_thread >= 0) {
IF_DEBUG(len = )
dr_snprintf(logname, BUFFER_SIZE_ELEMENTS(logname),
"%s%c%s.%d.%d.log", logsubdir, DIRSEP, name,
which_thread, dr_get_thread_id(dr_get_current_drcontext()));
/* have DR close on fork so we don't have to track and iterate */
extra_flags |= DR_FILE_CLOSE_ON_FORK;
} else {
IF_DEBUG(len = )
dr_snprintf(logname, BUFFER_SIZE_ELEMENTS(logname),
"%s%c%s", logsubdir, DIRSEP, name);
}
ASSERT(len > 0, "logfile name buffer max reached");
NULL_TERMINATE_BUFFER(logname);
f = dr_open_file(logname, DR_FILE_WRITE_OVERWRITE | extra_flags);
ASSERT(f != INVALID_FILE, "unable to open log file");
if (which_thread > 0) {
void *drcontext = dr_get_current_drcontext();
dr_log(drcontext, LOG_ALL, 1,
"DrMemory: log for thread "TIDFMT" is %s\n",
dr_get_thread_id(drcontext), logname);
NOTIFY("thread logfile is %s"NL, logname);
}
return f;
}
/* also initializes logsubdir */
static void
create_global_logfile(void)
{
uint count = 0;
const char *appnm = dr_get_application_name();
const uint LOGDIR_TRY_MAX = 1000;
/* PR 408644: pick a new subdir inside base logdir */
/* PR 453867: logdir must have pid in its name */
do {
dr_snprintf(logsubdir, BUFFER_SIZE_ELEMENTS(logsubdir),
"%s%cDrHeapstat-%s.%d.%03d",
options.logdir, DIRSEP, appnm == NULL ? "null" : appnm,
dr_get_process_id(), count);
NULL_TERMINATE_BUFFER(logsubdir);
/* FIXME PR 514092: if the base logdir is unwritable, we shouldn't loop
* UINT_MAX times: it looks like we've hung.
         * Unfortunately dr_directory_exists() is Windows-only and
* dr_create_dir returns only a bool, so for now we just
* fail if we hit 1000 dirs w/ same pid.
*/
} while (!dr_create_dir(logsubdir) && ++count < LOGDIR_TRY_MAX);
if (count >= LOGDIR_TRY_MAX) {
NOTIFY_ERROR("Unable to create subdir in log base dir %s"NL, options.logdir);
ASSERT(false, "unable to create unique logsubdir");
dr_abort();
}
f_global = open_logfile("global", true/*pid suffix*/, -1);
#ifdef UNIX
/* make it easier for wrapper script to find this logfile */
dr_fprintf(f_global, "process=%d, parent=%d\n",
dr_get_process_id(), dr_get_parent_id());
#endif
/* make sure "Dr. Heapstat" is 1st (or 2nd on linux) in file (for PR 453867) */
dr_fprintf(f_global, "Dr. Heapstat version %s\n", VERSION_STRING);
NOTIFY("log dir is %s"NL, logsubdir);
LOGF(1, f_global, "running %s\n",
(dr_get_application_name() == NULL) ? "<null>" : dr_get_application_name());
LOGF(1, f_global, "global logfile fd=%d\n", f_global);
f_callstack = open_logfile("callstack.log", false, -1);
f_snapshot = open_logfile("snapshot.log", false, -1);
if (options.staleness)
f_staleness = open_logfile("staleness.log", false, -1);
    /* For long-running multi-process apps like sfcbd, this can mean a lot of
     * index files. With each file being 1 MB minimum on esxi, space can be
     * used up fast. On the other hand, computing the nudge index in postprocess
     * each time can be time-consuming with a large number of snapshots, thereby
     * affecting the vistool startup time. No simple solution. If the file
* problem gets out of hand, we might use the global log, but then that
* requires parsing the global log, which can get large.
*/
f_nudge = open_logfile("nudge.idx", false, -1);
dr_fprintf(f_nudge, "%s snapshots\n", options.dump ? "variable" : "constant");
}
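/* For reference, the files created in logsubdir (which is named
 * <logdir>/DrHeapstat-<appname>.<pid>.<counter>) are:
 *   global.<pid>.log - notifications, leak reports, statistics
 *   callstack.log    - one entry per unique allocation callstack
 *   snapshot.log     - memory usage snapshots
 *   staleness.log    - staleness snapshots (only with -staleness)
 *   nudge.idx        - file offsets of post-nudge data (PR 502468)
 */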
static void
close_file(file_t f)
{
/* with DRi#357, DR now isolates log files so little to do here */
dr_close_file(f);
}
#define dr_close_file DO_NOT_USE_dr_close_file
static void
reset_to_time_zero(bool keep_offs)
{
int i;
dr_mutex_lock(snapshot_lock);
/* take current data and make it the cur val of to-be-snapshot 0 */
if (snap_idx != 0)
copy_snapshot(&snaps[0], &snaps[snap_idx], true/*live copy*/);
for (i = 1; i < options.snapshots; i++)
free_snapshot(&snaps[i]);
memset(&snaps[1], 0, (options.snapshots-1)*sizeof(*snaps));
if (keep_offs)
stamp_offs = stamp;
else
reset_intervals();
stamp = 0;
snap_idx = 0;
/* malloc_count we do not reset */
dr_mutex_unlock(snapshot_lock);
}
static void
event_timer(void *drcontext, dr_mcontext_t *mcontext)
{
bool alarm_clock = false, alarm_stale = false;
if (sideline_exit) {
#ifdef UNIX
dr_set_itimer(ITIMER_REAL, 0, event_timer);
#endif
return;
}
if (options.time_clock) {
ASSERT(timer_real <= timer_clock, "timer internal error");
timer_clock -= timer_real;
if (timer_clock == 0) {
alarm_clock = true;
timer_clock = options.dump_freq*TIME_BASE_FREQ;
}
}
if (options.staleness) {
ASSERT(timer_real <= timer_stale, "timer internal error");
timer_stale -= timer_real;
if (timer_stale == 0) {
alarm_stale = true;
/* FIXME PR 553724: should -stale_granularity be increased as
* -dump_freq increases? For a long-running app it seems fine to
* make the staleness data coarser and coarser, which also improves
* performance.
*/
timer_stale = options.stale_granularity;
}
}
/* reset now before potentially taking a while in code below, to avoid
* clock drift
*/
reset_real_timer();
if (alarm_clock) {
timestamp_last_snapshot = dr_get_milliseconds();
/* Must hold malloc lock first. We set alloc_ops.global_lock, so this
* will synchronize w/ all allocations and frees.
*/
malloc_lock();
take_snapshot();
malloc_unlock();
}
if (alarm_stale) {
staleness_sweep(stamp);
}
}
static void
reset_real_timer(void)
{
/* set real timer to the smaller and when it fires we'll adjust both */
if (options.staleness) {
if (options.time_clock) {
timer_real = (timer_clock > 0 && timer_clock < timer_stale) ?
timer_clock : timer_stale;
} else
timer_real = timer_stale;
} else {
ASSERT(options.time_clock, "timer should not be active");
timer_real = timer_clock;
}
ASSERT(timer_real >= 0, "timer internal error");
#ifdef UNIX
if (!dr_set_itimer(ITIMER_REAL, timer_real, event_timer))
ASSERT(false, "unable to set up timer callback\n");
#endif
}
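/* Worked example (hypothetical option values): with -time_clock -dump_freq 50
 * (timer_clock = 500ms) and -staleness -stale_granularity 200, timer_real is
 * armed for 200ms. On the first firing, event_timer() subtracts 200 from both
 * counters, sweeps staleness, re-arms timer_stale to 200, and only takes a
 * clock snapshot once timer_clock reaches 0, i.e., after 500ms total.
 */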
static void
reset_clock_timer(void)
{
if (options.time_clock)
timer_clock = options.dump_freq*TIME_BASE_FREQ;
reset_real_timer();
}
/* For -time_clock we need a timer (PR 476008): simplest to use a separate thread */
static void
sideline_run(void *arg)
{
void *drcontext = dr_get_current_drcontext();
/* PR 609569: we spend a lot of time holding locks, so keep running
* during synchall. Our locks keep us safe wrt leak scan, and we
* do not mutate any app state or non-persistent DR state.
*/
dr_client_thread_set_suspendable(false);
if (options.thread_logs) {
/* easier to debug w/ all staleness and clock snaps in sep file */
/* FIXME i#297: have to use global heap for sideline thread */
sideline_pt = global_alloc(sizeof(*sideline_pt), HEAPSTAT_MISC);
memset(sideline_pt, 0, sizeof(*sideline_pt));
/* store it in the slot provided in the drcontext */
drmgr_set_tls_field(drcontext, tls_idx_util, (void *)sideline_pt);
utils_thread_set_file(drcontext, open_logfile("sideline.log", false, -1));
}
if (options.staleness)
timer_stale = options.stale_granularity;
reset_clock_timer();
ASSERT(options.time_clock || options.staleness, "thread should not be running");
LOG(1, "sideline thread "TIDFMT" running\n", dr_get_thread_id(drcontext));
while (!sideline_exit) {
#ifdef WINDOWS
dr_sleep(timer_real);
/* FIXME: check wall-clock time and normalize to get closer to real time */
event_timer(drcontext, NULL);
#else
dr_sleep(500);
#endif
}
#ifdef UNIX
dr_set_itimer(ITIMER_REAL, 0, event_timer);
#endif
/* i#297: we can't clean up sideline_pt so event_exit does it */
}
#ifdef UNIX
static void
event_fork(void *drcontext)
{
/* we want a whole new log dir to avoid clobbering the parent's */
tls_heapstat_t *pt = (tls_heapstat_t *)
drmgr_get_tls_field(drcontext, tls_idx_heapstat);
/* fds are shared across fork so we must duplicate */
file_t f_parent_callstack = dr_dup_file_handle(f_callstack);
/* we assume no lock is needed since only one thread post-fork */
static char buf[4096];
close_file(f_global);
close_file(f_callstack);
close_file(f_snapshot);
if (options.staleness)
close_file(f_staleness);
close_file(f_nudge);
/* now create new files for all 5 */
create_global_logfile();
utils_thread_set_file(drcontext, f_global);
LOG(0, "new logfile after fork fd=%d\n", f_global);
    /* We don't expect the user to go find the parent's data,
     * so we duplicate the callstack file and start the
     * snapshots over. We no longer keep full callstacks
     * in memory (PR 496304), so we copy the file. To avoid
     * having to parse it to find callstack id # num_callstacks,
     * we store the file position.
*/
if (dr_file_seek(f_parent_callstack, 0, DR_SEEK_SET)) {
int64 curpos = 0;
IF_DEBUG(ssize_t sz;)
LOG(1, "copying parent callstacks "INT64_FORMAT_STRING" bytes\n", pt->filepos);
while (curpos + sizeof(buf) <= pt->filepos) {
IF_DEBUG(sz = )
dr_read_file(f_parent_callstack, buf, sizeof(buf));
ASSERT(sz == sizeof(buf), "error reading parent callstack data");
IF_DEBUG(sz = )
dr_write_file(f_callstack, buf, sizeof(buf));
ASSERT(sz == sizeof(buf), "error writing parent callstack data");
curpos += sizeof(buf);
}
ASSERT(pt->filepos - curpos < sizeof(buf), "buf calc error");
IF_DEBUG(sz = )
dr_read_file(f_parent_callstack, buf, pt->filepos - curpos);
ASSERT(sz == pt->filepos - curpos, "error reading parent callstack data");
IF_DEBUG(sz = )
dr_write_file(f_callstack, buf, pt->filepos - curpos);
ASSERT(sz == pt->filepos - curpos, "error writing parent callstack data");
} else
LOG(0, "ERROR: unable to copy parent callstack file\n");
close_file(f_parent_callstack);
reset_to_time_zero(false/*start time over*/);
}
#endif
#ifdef UNIX
dr_signal_action_t
event_signal(void *drcontext, dr_siginfo_t *info)
{
if (info->sig == SIGSEGV && ZERO_STACK() &&
handle_zeroing_fault(drcontext, info->access_address, info->raw_mcontext,
info->mcontext)) {
return DR_SIGNAL_SUPPRESS;
}
return DR_SIGNAL_DELIVER;
}
#else
bool
event_exception(void *drcontext, dr_exception_t *excpt)
{
if (excpt->record->ExceptionCode == STATUS_ACCESS_VIOLATION) {
app_pc target = (app_pc) excpt->record->ExceptionInformation[1];
if (ZERO_STACK() &&
excpt->record->ExceptionInformation[0] == 1 /* write */ &&
handle_zeroing_fault(drcontext, target, excpt->raw_mcontext,
excpt->mcontext)) {
return false;
}
}
return true;
}
#endif
bool
event_restore_state_nop(void *drcontext, bool restore_memory,
dr_restore_state_info_t *info)
{
/* nothing: just here to avoid DR warning on zeroing loop faulting instrs */
return true;
}
static bool
event_filter_syscall(void *drcontext, int sysnum)
{
switch (sysnum) {
#ifdef UNIX
case SYS_close:
case SYS_fork:
case SYS_clone:
IF_VMX86(case 1025:)
return true;
#endif
default:
return alloc_syscall_filter(drcontext, sysnum);
}
}
static bool
event_pre_syscall(void *drcontext, int sysnum)
{
tls_heapstat_t *pt = (tls_heapstat_t *)
drmgr_get_tls_field(drcontext, tls_idx_heapstat);
dr_mcontext_t *mc;
if (drsys_get_mcontext(drcontext, &mc) != DRMF_SUCCESS)
ASSERT(false, "drsys_get_mcontext failed");
DOLOG(2, {
drsys_sysnum_t sysnum_full;
drsys_syscall_t *syscall;
const char *name;
if (drsys_cur_syscall(drcontext, &syscall) != DRMF_SUCCESS)
ASSERT(false, "shouldn't fail");
if (drsys_syscall_number(syscall, &sysnum_full) != DRMF_SUCCESS)
ASSERT(false, "drsys_get_sysnum failed");
if (drsys_syscall_name(syscall, &name) != DRMF_SUCCESS)
ASSERT(false, "drsys_syscall_name failed");
LOG(2, "system call #%d==%d.%d %s\n", sysnum,
sysnum_full.number, sysnum_full.secondary, name);
});
handle_pre_alloc_syscall(drcontext, sysnum, mc);
#ifdef UNIX
if (sysnum == SYS_fork ||
(sysnum == SYS_clone &&
!TEST(CLONE_VM, (uint) dr_syscall_get_param(drcontext, 0)))
/* FIXME: if open-sourced we should split this.
* Presumably we'll have the bora shlib split before then.
*/
IF_VMX86(|| sysnum == 1025)) {
/* Store the file offset in the client_data field, shared across callbacks */
pt->filepos = dr_file_tell(f_callstack);
LOG(1, "SYS_fork: callstack file @ "INT64_FORMAT_STRING"\n", pt->filepos);
}
#endif
return true; /* execute syscall */
}
static void
event_post_syscall(void *drcontext, int sysnum)
{
dr_mcontext_t *mc;
if (drsys_get_mcontext(drcontext, &mc) != DRMF_SUCCESS)
ASSERT(false, "drsys_get_mcontext failed");
handle_post_alloc_syscall(drcontext, sysnum, mc);
}
void
check_reachability(bool at_exit)
{
if (options.check_leaks) {
void *drcontext = dr_get_current_drcontext();
ssize_t len = 0;
size_t sofar = 0;
char *buf;
size_t bufsz;
leak_scan_for_leaks(at_exit);
get_buffer(drcontext, &buf, &bufsz);
BUFPRINT(buf, bufsz, sofar, len,
"ERRORS IGNORED:\n %5d still-reachable allocation(s)\n",
reachable_leak_count);
if (!options.show_reachable) {
BUFPRINT(buf, bufsz, sofar, len,
" (re-run with \"-check_leaks -show_reachable\""
" for details)\n");
}
print_buffer(f_global, buf);
release_buffer(drcontext, buf, bufsz);
}
}
static void
print_nudge_header(file_t f)
{
dr_fprintf(f, "NUDGE @ %16"INT64_FORMAT"u %s\n\n",
snaps[snap_idx].stamp, unit_name());
}
static void
event_nudge(void *drcontext, uint64 argument)
{
uint64 snapshot_fpos = 0;
uint64 staleness_fpos = 0;
/* PR 476043: use nudge to output snapshots for daemon app. For
* now we have only one use, so we don't need the argument, but we
* may want an option to reset the start point for whole-run
* vs since-last-nudge snapshots: though that would require
* changing all usage types to be signed (PR 553707) so we
* keep everything absolute w/ no reset on nudge.
*/
/* We hold the malloc lock across the snapshot + reset to prevent
* new allocs being added in between
*/
malloc_lock(); /* must be acquired before snapshot_lock */
nudge_count++;
snapshot_dump_all();
print_nudge_header(f_snapshot);
print_nudge_header(f_callstack);
if (options.dump) {
/* For const # snapshots, we want the peak to be the global peak for the
* whole run, regardless of how many nudges were done.
* For dump-as-you-go, we want a local peak since the last nudge.
*/
free_snapshot(&snap_peak);
memset(&snap_peak, 0, sizeof(snap_peak));
}
if (options.staleness)
print_nudge_header(f_staleness);
    /* Print the nudge index information, i.e., for each nudge, print the file
     * position in snapshot.log that marks the beginning of the new data written
     * after the nudge data was dumped.  This makes nudge handling efficient,
     * as opposed to building the index during post-processing, which would
     * require a full pass over snapshot.log, which can be large.  The same
     * applies to staleness.log.  Also specify whether snapshots are fixed in
     * number or variable (-dump).  PR 502468.
     */
snapshot_fpos = dr_file_tell(f_snapshot);
if (options.staleness)
staleness_fpos = dr_file_tell(f_staleness);
ASSERT(snapshot_fpos >= 0 && staleness_fpos >= 0, "bad log file location");
dr_fprintf(f_nudge, "%d,%"INT64_FORMAT"u,%"INT64_FORMAT"u\n",
nudge_count, snapshot_fpos, staleness_fpos);
malloc_unlock();
check_reachability(false/*!at_exit*/);
print_nudge_header(f_global);
NOTIFY("Received nudge"NL, logsubdir);
}
static void
event_module_load(void *drcontext, const module_data_t *info, bool loaded)
{
callstack_module_load(drcontext, info, loaded);
alloc_module_load(drcontext, info, loaded);
}
static void
event_module_unload(void *drcontext, const module_data_t *info)
{
callstack_module_unload(drcontext, info);
alloc_module_unload(drcontext, info);
}
static void
event_fragment_delete(void *drcontext, void *tag)
{
instrument_fragment_delete(drcontext, tag);
alloc_fragment_delete(drcontext, tag);
}
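/* Thread-init event: allocate and register our per-thread data, choose the
 * per-thread (or global) log file, and size the error buffer so that a full
 * callstack always fits.
 */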
static void
event_thread_init(void *drcontext)
{
uint which_thread = atomic_add32_return_sum((volatile int *)&num_threads, 1) - 1;
file_t f;
tls_heapstat_t *pt = (tls_heapstat_t *)
thread_alloc(drcontext, sizeof(*pt), HEAPSTAT_MISC);
memset(pt, 0, sizeof(*pt));
drmgr_set_tls_field(drcontext, tls_idx_heapstat, (void *)pt);
utils_thread_init(drcontext);
ELOGF(0, f_global, "new thread #%d id=%d\n",
which_thread, dr_get_thread_id(drcontext));
if (!options.thread_logs) {
f = f_global;
} else {
/* we're going to dump our data to a per-thread file */
f = open_logfile("thread", false, which_thread/*tid suffix*/);
LOGPT(1, PT_GET(drcontext), "thread logfile fd=%d\n", f);
}
utils_thread_set_file(drcontext, f);
pt->errbufsz = MAX_ERROR_INITIAL_LINES + max_callstack_size();
pt->errbuf = (char *) thread_alloc(drcontext, pt->errbufsz, HEAPSTAT_MISC);
LOGPT(2, PT_GET(drcontext), "in event_thread_init()\n");
callstack_thread_init(drcontext);
if (options.check_leaks || options.staleness)
shadow_thread_init(drcontext);
if (options.staleness)
instrument_thread_init(drcontext);
}
static void
event_thread_exit(void *drcontext)
{
tls_heapstat_t *pt = (tls_heapstat_t *)
drmgr_get_tls_field(drcontext, tls_idx_heapstat);
LOGPT(2, PT_GET(drcontext), "in event_thread_exit()\n");
if (options.staleness)
        instrument_thread_exit(drcontext);
callstack_thread_exit(drcontext);
utils_thread_exit(drcontext);
thread_free(drcontext, (void *) pt->errbuf, pt->errbufsz, HEAPSTAT_MISC);
/* with PR 536058 we do have dcontext in exit event so indicate explicitly
* that we've cleaned up the per-thread data
*/
drmgr_set_tls_field(drcontext, tls_idx_heapstat, NULL);
thread_free(drcontext, pt, sizeof(*pt), HEAPSTAT_MISC);
}
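/* malloc_iterate() callback used at exit to free the per-allocation staleness
 * data attached to each still-live allocation.
 */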
static bool
alloc_itercb_exit(malloc_info_t *info, void *iter_data)
{
if (options.staleness)
staleness_free_per_alloc((stale_per_alloc_t *)info->client_data);
return true;
}
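/* Process-exit event: signal the sideline thread to stop and clean up after it
 * (i#297), finalize snapshots, run the exit-time leak scan, and tear the
 * remaining modules down in dependency order (see the ordering comments below).
 */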
static void
event_exit(void)
{
LOGF(2, f_global, "in event_exit\n");
if (options.time_clock || options.staleness) {
sideline_exit = true;
if (options.thread_logs) {
/* i#297: sideline_run never gets a chance to clean up so we do it */
ASSERT(sideline_pt != NULL, "sideline per-thread error");
global_free(sideline_pt, sizeof(*sideline_pt), HEAPSTAT_MISC);
}
}
snapshot_exit();
if (options.check_leaks) {
check_reachability(true/*at_exit*/);
leak_exit();
}
if (options.staleness)
malloc_iterate(alloc_itercb_exit, NULL);
alloc_exit(); /* must be before deleting alloc_stack_table */
heap_region_exit(); /* must be after alloc_exit */
LOG(1, "final alloc stack table size: %u bits, %u entries\n",
alloc_stack_table.table_bits, alloc_stack_table.entries);
hashtable_delete(&alloc_stack_table);
#ifdef CHECK_WITH_MD5
hashtable_delete(&alloc_md5_table);
#endif
callstack_exit();
if (options.check_leaks || options.staleness) {
instrument_exit();
shadow_exit();
}
free_shared_code();
utils_exit();
#ifdef USE_DRSYMS
if (options.use_symcache)
drsymcache_exit();
#endif
#ifdef STATISTICS
dump_statistics();
#endif
if (umbra_exit() != DRMF_SUCCESS)
ASSERT(false, "fail to finalize umbra");
if (drsys_exit() != DRMF_SUCCESS)
ASSERT(false, "drsys failed to exit");
drmgr_unregister_tls_field(tls_idx_heapstat);
drwrap_exit();
drmgr_exit();
dr_fprintf(f_global, "LOG END\n");
close_file(f_global);
dr_fprintf(f_callstack, "LOG END\n");
close_file(f_callstack);
dr_fprintf(f_snapshot, "LOG END\n");
close_file(f_snapshot);
if (options.staleness) {
dr_fprintf(f_staleness, "LOG END\n");
close_file(f_staleness);
}
close_file(f_nudge);
}
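/* Client entry point.  Initialization order matters: drmgr must come before
 * any TLS/CLS users, shadow_init() before heap_walk() and alloc_init(), and
 * heap_walk() after heap_region_init() and snapshot_init() (see the inline
 * comments below).
 */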
DR_EXPORT void
dr_init(client_id_t client_id)
{
const char *opstr = dr_get_options(client_id);
alloc_options_t alloc_ops;
drsys_options_t ops = { sizeof(ops), 0, };
callstack_options_t callstack_ops = { sizeof(callstack_ops), 0 };
drmgr_priority_t priority = {sizeof(priority), "drheapstat", NULL, NULL, 1000};
dr_set_client_name("Dr. Heapstat", "http://drmemory.org/issues");
ASSERT(opstr != NULL, "error obtaining option string");
drheap_options_init(opstr);
drmgr_init(); /* must be before utils_init and any other tls/cls uses */
tls_idx_heapstat = drmgr_register_tls_field();
ASSERT(tls_idx_heapstat > -1, "unable to reserve TLS slot");
drwrap_init();
utils_init();
if (drsys_init(client_id, &ops) != DRMF_SUCCESS)
ASSERT(false, "drsys failed to init");
if (umbra_init(client_id) != DRMF_SUCCESS)
ASSERT(false, "Umbra failed to init");
/* now that we know whether -quiet, print basic info */
NOTIFY("Dr. Heapstat version %s"NL, VERSION_STRING);
NOTIFY("options are \"%s\""NL, opstr);
create_global_logfile();
LOG(0, "options are \"%s\"\n", opstr);
dr_register_exit_event(event_exit);
drmgr_register_thread_init_event(event_thread_init);
drmgr_register_thread_exit_event(event_thread_exit);
if (!drmgr_register_bb_instrumentation_ex_event
(event_bb_app2app, event_bb_analysis, event_bb_insert,
event_bb_instru2instru, &priority))
ASSERT(false, "drmgr registration failed");
drmgr_register_module_load_event(event_module_load);
drmgr_register_module_unload_event(event_module_unload);
#ifdef UNIX
dr_register_fork_init_event(event_fork);
#endif
if (ZERO_STACK()) {
#ifdef UNIX
drmgr_register_signal_event(event_signal);
#else
drmgr_register_exception_event(event_exception);
#endif
}
dr_register_filter_syscall_event(event_filter_syscall);
drmgr_register_pre_syscall_event(event_pre_syscall);
drmgr_register_post_syscall_event(event_post_syscall);
/* simplest to filter all for pre-syscall-arg access: else we'd
* need to ask alloc for all that it cares about.
*/
if (drsys_filter_all_syscalls() != DRMF_SUCCESS)
ASSERT(false, "drsys_filter_all_syscalls should never fail");
dr_register_nudge_event(event_nudge, client_id);
if (options.staleness)
drmgr_register_restore_state_ex_event(event_restore_state);
/* DR complains if we have xl8 fields and faults w/o restore-state events */
else if (ZERO_STACK())
drmgr_register_restore_state_ex_event(event_restore_state_nop);
if (options.staleness)
dr_register_delete_event(event_fragment_delete);
/* make it easy to tell, by looking at log file, which client executed */
dr_log(NULL, LOG_ALL, 1, "client = Dr. Heapstat version %s\n", VERSION_STRING);
#ifdef USE_DRSYMS
if (options.use_symcache) {
if (!option_specified.symcache_dir) {
dr_snprintf(options.symcache_dir, BUFFER_SIZE_ELEMENTS(options.symcache_dir),
"%s/symcache", options.logdir);
NULL_TERMINATE_BUFFER(options.symcache_dir);
}
drsymcache_init(client_id, options.symcache_dir, 0);
}
#endif
snapshot_init();
callstack_ops.global_max_frames = options.callstack_max_frames;
callstack_ops.stack_swap_threshold = 0x10000;
/* default flags: but if we have apps w/ DGC we may
* want to expose some flags as options
*/
callstack_ops.fp_flags = 0;
    /* scan forward 1 page: a good compromise between perf (scanning
     * can be the bottleneck) and good callstacks
     */
callstack_ops.fp_scan_sz = PAGE_SIZE;
/* XXX i#926: symbolize and suppress leaks online (and then
* use options.callstack_style here)
*/
callstack_ops.print_flags = PRINT_FOR_POSTPROCESS;
callstack_ops.ignore_xbp = IF_WINDOWS_ELSE(is_in_seh_unwind, NULL);
/* XXX: may need better callstack heuristics w/o shadow info
* if user turns off stack zeroing from -leaks_only
*/
callstack_ops.old_retaddrs_zeroed = options.zero_retaddr;
callstack_ops.tool_lib_ignore = IF_WINDOWS_ELSE("drheapstatlib.dll",
"libdrheapstatlib.so*");
callstack_ops.bad_fp_list = options.callstack_bad_fp_list;
callstack_init(&callstack_ops);
heap_region_init(client_heap_add, client_heap_remove);
/* We keep callstacks around forever and only free when we delete
* the alloc_stack_table, so no refcounts
*/
/* must be before heap_walk() and alloc_init() */
if (options.check_leaks || options.staleness)
shadow_init();
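    /* Callstacks are keyed by a digest of their frames: MD5 when USE_MD5 is
     * defined, otherwise a combined CRC32 over the whole callstack and its
     * first half (crc32_whole_and_half_hash) to reduce collisions.  The table
     * owns the callstack memory, freed via alloc_callstack_free on deletion.
     */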
hashtable_init_ex(&alloc_stack_table, ASTACK_TABLE_HASH_BITS, HASH_CUSTOM,
false/*!str_dup*/, false/* !synch; + higher-level synch covered
* by malloc_table's lock */,
alloc_callstack_free,
#ifdef USE_MD5
(uint (*)(void*)) md5_hash,
(bool (*)(void*, void*)) md5_digests_equal
#else
(uint (*)(void*)) crc32_whole_and_half_hash,
(bool (*)(void*, void*)) crc32_whole_and_half_equal
#endif
);
#ifdef CHECK_WITH_MD5
hashtable_init_ex(&alloc_md5_table, ASTACK_TABLE_HASH_BITS, HASH_CUSTOM,
false/*!str_dup*/, false/* !synch; + higher-level synch covered
* by malloc_table's lock */,
NULL/*points at md5 data stored in alloc_stack_table*/,
(uint (*)(void*)) md5_hash,
(bool (*)(void*, void*)) md5_digests_equal);
#endif
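    /* Configure the allocation interposition: malloc replacement with no
     * redzones (we profile heap usage rather than detect overflows) and a
     * global alloc lock so allocation events serialize with our snapshots.
     */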
memset(&alloc_ops, 0, sizeof(alloc_ops));
alloc_ops.track_allocs = true;
alloc_ops.track_heap = true;
alloc_ops.redzone_size = 0; /* no redzone */
alloc_ops.size_in_redzone = false;
alloc_ops.record_allocs = true;
alloc_ops.get_padded_size = true;
alloc_ops.cache_postcall = false;
alloc_ops.intercept_operators = false;
alloc_ops.conservative = options.conservative;
alloc_ops.global_lock = true; /* we want to serialize w/ our snapshots */
alloc_ops.replace_malloc = true;
alloc_ops.use_symcache = options.use_symcache;
alloc_init(&alloc_ops, sizeof(alloc_ops));
/* must be after heap_region_init and snapshot_init */
heap_walk();
if (options.check_leaks || options.staleness)
instrument_init();
create_shared_code();
if (options.time_clock)
timestamp_last_snapshot = dr_get_milliseconds();
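    /* The sideline thread (sideline_run) periodically triggers -time_clock
     * snapshots and drives the -staleness timer, checking sideline_exit to
     * know when to stop.
     */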
if (options.time_clock || options.staleness) {
if (!dr_create_client_thread(sideline_run, NULL)) {
ASSERT(false, "unable to create thread");
}
}
if (options.check_leaks) {
leak_init(false/*no defined info*/,
options.check_leaks_on_destroy,
options.midchunk_new_ok,
options.midchunk_inheritance_ok,
options.midchunk_string_ok,
options.midchunk_size_ok,
options.show_reachable,
IF_WINDOWS_(options.check_encoded_pointers)
NULL, NULL, NULL);
}
}