/* **********************************************************
* Copyright (c) 2010-2022 Google, Inc. All rights reserved.
* Copyright (c) 2000-2010 VMware, Inc. All rights reserved.
* **********************************************************/
/*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of VMware, Inc. nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
/* Copyright (c) 2003-2007 Determina Corp. */
/* Copyright (c) 2001-2003 Massachusetts Institute of Technology */
/* Copyright (c) 2000-2001 Hewlett-Packard Company */
/*
* dynamo.c -- initialization and cleanup routines for DynamoRIO
*/
#include "globals.h"
#include "configure_defines.h"
#include "link.h"
#include "fragment.h"
#include "fcache.h"
#include "emit.h"
#include "dispatch.h"
#include "utils.h"
#include "monitor.h"
#include "vmareas.h"
#ifdef SIDELINE
# include "sideline.h"
#endif
#ifdef PAPI
# include "perfctr.h"
#endif
#include "instrument.h"
#include "hotpatch.h"
#include "moduledb.h"
#include "module_shared.h"
#include "synch.h"
#include "native_exec.h"
#include "jit_opt.h"
#ifdef ANNOTATIONS
# include "annotations.h"
#endif
#ifdef WINDOWS
/* for close handle, duplicate handle, free memory, and the constants
* associated with them
*/
/* also for nt_terminate_process_for_app() */
# include "ntdll.h"
# include "nudge.h" /* to get generic_nudge_target() address for an assert */
#endif
#ifdef RCT_IND_BRANCH
# include "rct.h"
#endif
#include "perscache.h"
#ifdef VMX86_SERVER
# include "vmkuw.h"
#endif
#ifndef STANDALONE_UNIT_TEST
# ifdef __AVX512F__
# error "DynamoRIO core should run without AVX-512 instructions to remain \
portable and to avoid frequency scaling."
# endif
#endif
/* global thread-shared variables */
bool dynamo_initialized = false;
static bool dynamo_options_initialized = false;
bool dynamo_heap_initialized = false;
bool dynamo_started = false;
bool automatic_startup = false;
bool control_all_threads = false;
/* On Windows we can't reliably tell attach apart from our default late
* injection, and we do see early threads in place (which is the point of
* this flag), so we always set it.
*/
bool dynamo_control_via_attach = IF_WINDOWS_ELSE(true, false);
#ifdef WINDOWS
bool dr_early_injected = false;
int dr_early_injected_location = INJECT_LOCATION_Invalid;
bool dr_earliest_injected = false;
static void *dr_earliest_inject_args;
/* Should be set if we are controlling the primary thread, either by
* injecting initially (!dr_injected_secondary_thread) or by retaking it
* over (dr_late_injected_primary_thread). Used only for debugging
* purposes, yet we can't rely on !dr_injected_secondary_thread very
* early in the process.
*/
bool dr_injected_primary_thread = false;
bool dr_injected_secondary_thread = false;
/* should be set once we re-take over the primary thread for -inject_primary */
bool dr_late_injected_primary_thread = false;
#endif /* WINDOWS */
/* flags to indicate when DR is being initialized / exited using the API */
bool dr_api_entry = false;
bool dr_api_exit = false;
#ifdef RETURN_AFTER_CALL
bool dr_preinjected = false;
#endif /* RETURN_AFTER_CALL */
#ifdef UNIX
static bool dynamo_exiting = false;
#endif
bool dynamo_exited = false;
bool dynamo_exited_all_other_threads = false;
bool dynamo_exited_and_cleaned = false;
#ifdef DEBUG
bool dynamo_exited_log_and_stats = false;
#endif
/* Only used in release build to decide whether synch is needed, justifying
* its placement in .nspdata. If we use it for more we should protect it.
*/
DECLARE_NEVERPROT_VAR(bool dynamo_all_threads_synched, false);
bool dynamo_resetting = false;
bool standalone_library = false;
static int standalone_init_count;
#ifdef UNIX
bool post_execve = false;
#endif
/* initial stack so we don't have to use app's */
byte *d_r_initstack;
event_t dr_app_started;
event_t dr_attach_finished;
#ifdef WINDOWS
/* PR203701: separate stack for error reporting when the dstack is exhausted */
# define EXCEPTION_STACK_SIZE (2 * PAGE_SIZE)
DECLARE_NEVERPROT_VAR(byte *exception_stack, NULL);
#endif
/*******************************************************/
/* separate segment of Non-Self-Protected data to avoid data section
* protection issues -- we need to write to these vars in bootstrapping
* spots where we cannot unprotect first
*/
START_DATA_SECTION(NEVER_PROTECTED_SECTION, "w");
/* spinlock used in assembly trampolines when we can't spare registers for more */
mutex_t initstack_mutex IF_AARCH64(__attribute__((aligned(8))))
VAR_IN_SECTION(NEVER_PROTECTED_SECTION) = INIT_SPINLOCK_FREE(initstack_mutex);
byte *initstack_app_xsp VAR_IN_SECTION(NEVER_PROTECTED_SECTION) = 0;
/* keeps track of how many threads are in cleanup_and_terminate */
volatile int exiting_thread_count VAR_IN_SECTION(NEVER_PROTECTED_SECTION) = 0;
/* Tracks newly created threads not yet on the all_threads list. */
volatile int uninit_thread_count VAR_IN_SECTION(NEVER_PROTECTED_SECTION) = 0;
/* This is unprotected to allow stats to be written while the data
* segment is still protected (right now the only ones are selfmod stats)
*/
static dr_statistics_t nonshared_stats VAR_IN_SECTION(NEVER_PROTECTED_SECTION) = {
{ 0 },
};
/* Each lock protects its corresponding datasec_start, datasec_end, and
* datasec_writable variables.
*/
static mutex_t
datasec_lock[DATASEC_NUM] VAR_IN_SECTION(NEVER_PROTECTED_SECTION) = { { 0 } };
/* back to normal section */
END_DATA_SECTION()
/*******************************************************/
/* Like a recursive lock: 0==readonly, 1+=writable.
* This would be a simple array, but we need each in its own protected
* section, as this could be exploited.
*/
const uint datasec_writable_neverprot = 1; /* always writable */
uint datasec_writable_rareprot = 1;
DECLARE_FREQPROT_VAR(uint datasec_writable_freqprot, 1);
DECLARE_CXTSWPROT_VAR(uint datasec_writable_cxtswprot, 1);
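/* Illustrative sketch of the pairing used elsewhere in this file (e.g.,
* SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT) in dynamo_process_exit() and
* SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT) at the end of init):
*     SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT);  // count 0 -> 1: writable
*     ... write rarely-protected vars ...
*     SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT);    // count 1 -> 0: read-only
*/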
static app_pc datasec_start[DATASEC_NUM];
static app_pc datasec_end[DATASEC_NUM];
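/* The two arrays below are parallel, indexed by data-section id
* (DATASEC_NUM entries each): the self-protection flag and the section
* name for each protection level.
*/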
const uint DATASEC_SELFPROT[] = {
0,
SELFPROT_DATA_RARE,
SELFPROT_DATA_FREQ,
SELFPROT_DATA_CXTSW,
};
const char *const DATASEC_NAMES[] = {
NEVER_PROTECTED_SECTION,
RARELY_PROTECTED_SECTION,
FREQ_PROTECTED_SECTION,
CXTSW_PROTECTED_SECTION,
};
/* kept in unprotected heap to avoid issues w/ data segment being RO */
typedef struct _protect_info_t {
/* FIXME: this needs to be a recursive lock to handle signals
* and exceptions!
*/
mutex_t lock;
int num_threads_unprot; /* # threads in DR code */
int num_threads_suspended;
} protect_info_t;
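/* Descriptive note: ENTERING_DR() increments num_threads_unprot (its
* initialization below carries the comment "ENTERING_DR() below will inc
* to 1"), so the field counts threads currently executing inside DR code.
*/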
static protect_info_t *protect_info;
static void
data_section_init(void);
static void
data_section_exit(void);
#ifdef DEBUG /*************************/
# include <time.h>
/* FIXME: not all dynamo_options references are under #ifdef DEBUG --
* are we trying to hardcode the options for a release build?
*/
# ifdef UNIX
/* Linux include files for mmap stuff */
# include <sys/ipc.h>
# include <sys/types.h>
# include <unistd.h>
# endif
static uint starttime;
file_t main_logfile = INVALID_FILE;
#endif /* DEBUG ****************************/
dr_statistics_t *d_r_stats = NULL;
DECLARE_FREQPROT_VAR(static int num_known_threads, 0);
#ifdef UNIX
/* i#237/PR 498284: vfork threads that execve need to be separately delay-freed */
DECLARE_FREQPROT_VAR(int num_execve_threads, 0);
#endif
DECLARE_FREQPROT_VAR(static uint threads_ever_count, 0);
/* FIXME: not static so os.c can hand-walk it for dump core */
/* FIXME: use new generic_table_t and generic_hash_* routines */
thread_record_t **all_threads; /* ALL_THREADS_HASH_BITS-bit addressed hash table */
/* these locks are used often enough that we put them in .cspdata: */
/* not static so it can be referenced in win32/os.c for SuspendThread handling.
* FIXME: almost completely redundant in usage with thread_initexit_lock;
* maybe replace this lock with thread_initexit_lock? */
DECLARE_CXTSWPROT_VAR(mutex_t all_threads_lock, INIT_LOCK_FREE(all_threads_lock));
/* used for synch to prevent thread creation/deletion in critical periods
* due to its use for flushing, this lock cannot be held while couldbelinking!
*/
DECLARE_CXTSWPROT_VAR(mutex_t thread_initexit_lock, INIT_LOCK_FREE(thread_initexit_lock));
/* recursive to handle signals/exceptions while in DR code */
DECLARE_CXTSWPROT_VAR(static recursive_lock_t thread_in_DR_exclusion,
INIT_RECURSIVE_LOCK(thread_in_DR_exclusion));
static thread_synch_state_t
exit_synch_state(void);
static void
synch_with_threads_at_exit(thread_synch_state_t synch_res, bool pre_exit);
static void
delete_dynamo_context(dcontext_t *dcontext, bool free_stack);
/****************************************************************************/
#ifdef DEBUG
static const char *
main_logfile_name(void)
{
return get_app_name_for_path();
}
static const char *
thread_logfile_name(void)
{
return "log";
}
#endif /* DEBUG */
/****************************************************************************/
static void
statistics_pre_init(void)
{
/* until stats are set up for real, point at a static var;
* really only logmask and loglevel are meaningful then, so be careful!
* statistics_init and create_log_directory are currently the only routines
* that use stats before they're set up for real.
*/
/* The indirection here is left over from when we used to allow alternative
* locations for stats (namely shared memory for the old MIT gui). */
d_r_stats = &nonshared_stats;
d_r_stats->process_id = get_process_id();
strncpy(d_r_stats->process_name, get_application_name(), MAXIMUM_PATH);
d_r_stats->process_name[MAXIMUM_PATH - 1] = '\0';
ASSERT(strlen(d_r_stats->process_name) > 0);
d_r_stats->num_stats = 0;
}
static void
statistics_init(void)
{
/* should have called statistics_pre_init() first */
ASSERT(d_r_stats == &nonshared_stats);
ASSERT(d_r_stats->num_stats == 0);
#ifndef DEBUG
if (!DYNAMO_OPTION(global_rstats)) {
/* references to stat values should return 0 (static var) */
return;
}
#endif
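/* X-macro counting trick: each STATS_DEF/RSTATS_DEF line in statsx.h
* expands to "+1", so the statement below compiles down to
* num_stats = 0 +1 +1 ..., yielding the stat count at build time.
*/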
d_r_stats->num_stats = 0
#ifdef DEBUG
# define STATS_DEF(desc, name) +1
#else
# define RSTATS_DEF(desc, name) +1
#endif
#include "statsx.h"
#undef STATS_DEF
#undef RSTATS_DEF
;
/* We inline the stat description to make it easy for external processes
* to view our stats: they don't have to chase pointers, and we could put
* this in shared memory easily. However, we do waste some memory, but
* not much in release build.
*/
#ifdef DEBUG
# define STATS_DEF(desc, statname) \
strncpy(d_r_stats->statname##_pair.name, desc, \
BUFFER_SIZE_ELEMENTS(d_r_stats->statname##_pair.name)); \
NULL_TERMINATE_BUFFER(d_r_stats->statname##_pair.name);
#else
# define RSTATS_DEF(desc, statname) \
strncpy(d_r_stats->statname##_pair.name, desc, \
BUFFER_SIZE_ELEMENTS(d_r_stats->statname##_pair.name)); \
NULL_TERMINATE_BUFFER(d_r_stats->statname##_pair.name);
#endif
#include "statsx.h"
#undef STATS_DEF
#undef RSTATS_DEF
}
static void
statistics_exit(void)
{
if (doing_detach)
memset(d_r_stats, 0, sizeof(*d_r_stats)); /* for possible re-attach */
d_r_stats = NULL;
}
dr_statistics_t *
get_dr_stats(void)
{
return d_r_stats;
}
/* initialize per-process dynamo state; this must be called before any
* threads are created and before any other API calls are made;
* returns zero on success, non-zero on failure
*/
DYNAMORIO_EXPORT int
dynamorio_app_init(void)
{
dynamorio_app_init_part_one_options();
return dynamorio_app_init_part_two_finalize();
}
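/* A minimal embedder sketch (hypothetical caller) showing why init is split:
* part one parses options, the caller can then act on them (e.g., decide
* attach/injection specifics), and part two finishes initialization:
*     dynamorio_app_init_part_one_options();
*     ... consult parsed options, prepare takeover ...
*     if (dynamorio_app_init_part_two_finalize() != SUCCESS)
*         handle failure;
* dynamorio_app_init() above is the all-in-one wrapper.
*/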
void
dynamorio_app_init_part_one_options(void)
{
if (dynamo_initialized || dynamo_options_initialized) {
if (standalone_library) {
REPORT_FATAL_ERROR_AND_EXIT(STANDALONE_ALREADY, 2, get_application_name(),
get_application_pid());
}
} else /* we do enter if nullcalls is on */ {
#ifdef UNIX
os_page_size_init((const char **)our_environ, is_our_environ_followed_by_auxv());
#endif
#ifdef WINDOWS
/* MUST do this before making any system calls */
syscalls_init();
#endif
/* avoid time() for libc independence */
DODEBUG(starttime = query_time_seconds(););
#ifdef UNIX
if (getenv(DYNAMORIO_VAR_EXECVE) != NULL) {
post_execve = true;
# ifdef VMX86_SERVER
/* PR 458917: our gdt slot was not cleared on exec so we need to
* clear it now to ensure we don't leak it and eventually run out of
* slots. We could alternatively call os_tls_exit() prior to
* execve, since syscalls use thread-private fcache_enter, but it is
* complex to recover from execve failure, so instead we pass along
* which TLS index we had.
*/
os_tls_pre_init(atoi(getenv(DYNAMORIO_VAR_EXECVE)));
# endif
/* important to remove it, don't want to propagate to forked children, etc. */
/* i#909: unsetenv is unsafe as it messes up auxv access, so we disable */
disable_env(DYNAMORIO_VAR_EXECVE);
/* check that it's gone: we've had problems with unsetenv */
ASSERT(getenv(DYNAMORIO_VAR_EXECVE) == NULL);
} else
post_execve = false;
#endif
/* default non-zero dynamo settings (options structure is
* initialized to 0 automatically)
*/
#ifdef DEBUG
# ifndef INTERNAL
nonshared_stats.logmask = LOG_ALL_RELEASE;
# else
nonshared_stats.logmask = LOG_ALL;
# endif
statistics_pre_init();
#endif
d_r_config_init();
options_init();
#ifdef WINDOWS
syscalls_init_options_read(); /* must be called after options_init
* but before init_syscall_trampolines */
#endif
utils_init();
data_section_init();
#ifdef DEBUG
/* decision: nullcalls WILL create a dynamorio.log file and
* fill it with perfctr stats!
*/
if (d_r_stats->loglevel > 0) {
main_logfile = open_log_file(main_logfile_name(), NULL, 0);
LOG(GLOBAL, LOG_TOP, 1, "global log file fd=%d\n", main_logfile);
} else {
/* loglevel 0 means we don't create a log file!
* if the loglevel is later raised, too bad! it all goes to stderr!
* N.B.: when checking for no logdir, we check for empty string or
* first char '<'!
*/
strncpy(d_r_stats->logdir, "<none (loglevel was 0 on startup)>",
MAXIMUM_PATH - 1);
d_r_stats->logdir[MAXIMUM_PATH - 1] = '\0'; /* strncpy does not null-terminate at max */
main_logfile = INVALID_FILE;
}
# ifdef PAPI
/* setup hardware performance counting */
hardware_perfctr_init();
# endif
DOLOG(1, LOG_TOP, { print_version_and_app_info(GLOBAL); });
/* now exit if nullcalls, now that perfctrs are set up */
if (INTERNAL_OPTION(nullcalls)) {
return;
}
LOG(GLOBAL, LOG_TOP, 1, PRODUCT_NAME "'s stack size: %d Kb\n",
DYNAMORIO_STACK_SIZE / 1024);
#endif /* DEBUG */
/* set up exported statistics struct */
#ifndef DEBUG
statistics_pre_init();
#endif
statistics_init();
dynamo_options_initialized = true;
}
}
int
dynamorio_app_init_part_two_finalize(void)
{
if (!dynamo_options_initialized) {
/* Part one was never called. */
return FAILURE;
} else if (dynamo_initialized) {
if (standalone_library) {
REPORT_FATAL_ERROR_AND_EXIT(STANDALONE_ALREADY, 2, get_application_name(),
get_application_pid());
}
/* Nop. */
} else if (INTERNAL_OPTION(nullcalls)) {
print_file(main_logfile, "** nullcalls is set, NOT taking over execution **\n\n");
return SUCCESS;
} else {
#ifdef VMX86_SERVER
/* Must be before {vmm,d_r}_heap_init() */
vmk_init_lib();
#endif
/* initialize components (CAUTION: order is important here) */
vmm_heap_init(); /* must be called even if not using vmm heap */
/* PR 200207: load the client lib before callback_interception_init
* since the client library load would hit our own hooks (xref hotpatch
* cases about that) -- though -private_loader removes that issue.
*/
instrument_load_client_libs();
d_r_heap_init();
dynamo_heap_initialized = true;
/* The process start event should be done after d_r_os_init() but before
* process_control_init() because the former initializes event logging
* and the latter can kill the process if a violation occurs.
*/
SYSLOG(SYSLOG_INFORMATION, INFO_PROCESS_START_CLIENT, 2, get_application_name(),
get_application_pid());
#ifdef PROCESS_CONTROL
if (IS_PROCESS_CONTROL_ON()) /* Case 8594. */
process_control_init();
#endif
#ifdef WINDOWS
/* Now that DR is set up, perform any final clean-up, before
* we do our address space scans.
*/
if (dr_earliest_injected)
earliest_inject_cleanup(dr_earliest_inject_args);
#endif
dynamo_vm_areas_init();
d_r_decode_init();
proc_init();
modules_init(); /* before vm_areas_init() */
d_r_os_init();
config_heap_init(); /* after heap_init */
/* Setup for handling faults in loader_init() */
/* initial stack so we don't have to use app's
* N.B.: we never de-allocate d_r_initstack (see comments in app_exit)
*/
d_r_initstack = (byte *)stack_alloc(DYNAMORIO_STACK_SIZE, NULL);
LOG(GLOBAL, LOG_SYNCH, 2, "d_r_initstack is " PFX "-" PFX "\n",
d_r_initstack - DYNAMORIO_STACK_SIZE, d_r_initstack);
#ifdef WINDOWS
/* PR203701: separate stack for error reporting when the
* dstack is exhausted
*/
exception_stack = (byte *)stack_alloc(EXCEPTION_STACK_SIZE, NULL);
#endif
#ifdef WINDOWS
if (!INTERNAL_OPTION(noasynch)) {
/* We split the hooks up: first we put in just Ki* to catch
* exceptions in client init routines (PR 200207), but we don't want
* syscall hooks so client init can scan syscalls.
* Xref PR 216934 where this was originally down below 1st thread init,
* before we had GLOBAL_DCONTEXT.
*/
callback_interception_init_start();
}
#endif /* WINDOWS */
/* Set up any private-loader-related data we need before generating any
* code, such as the private PEB on Windows.
*/
loader_init_prologue();
d_r_arch_init();
synch_init();
#ifdef KSTATS
kstat_init();
#endif
d_r_monitor_init();
fcache_init();
d_r_link_init();
fragment_init();
moduledb_init(); /* before vm_areas_init, after heap_init */
perscache_init(); /* before vm_areas_init */
native_exec_init(); /* before vm_areas_init, after arch_init */
if (!DYNAMO_OPTION(thin_client)) {
#ifdef HOT_PATCHING_INTERFACE
/* must init hotp before vm_areas_init() calls find_executable_vm_areas() */
if (DYNAMO_OPTION(hot_patching))
hotp_init();
#endif
}
#ifdef INTERNAL
{
char initial_options[MAX_OPTIONS_STRING];
get_dynamo_options_string(&dynamo_options, initial_options,
sizeof(initial_options), true);
SYSLOG_INTERNAL_INFO("Initial options = %s", initial_options);
DOLOG(1, LOG_TOP, {
get_pcache_dynamo_options_string(&dynamo_options, initial_options,
sizeof(initial_options),
OP_PCACHE_LOCAL);
LOG(GLOBAL, LOG_TOP, 1, "Initial pcache-affecting options = %s\n",
initial_options);
});
}
#endif /* INTERNAL */
LOG(GLOBAL, LOG_TOP, 1, "\n");
/* initialize thread hashtable */
/* Note: for thin_client, this isn't needed if it is only going to
* look for spawned processes; however, if we plan to promote from
* thin_client to hotp_only mode (highly likely), this would be needed.
* For now, leave it in there unless thin_client footprint becomes an
* issue.
*/
int size = HASHTABLE_SIZE(ALL_THREADS_HASH_BITS) * sizeof(thread_record_t *);
all_threads =
(thread_record_t **)global_heap_alloc(size HEAPACCT(ACCT_THREAD_MGT));
memset(all_threads, 0, size);
if (!INTERNAL_OPTION(nop_initial_bblock) IF_WINDOWS(
|| !check_sole_thread())) /* some other thread is already here! */
bb_lock_start = true;
#ifdef SIDELINE
/* initialize sideline thread after thread table is set up */
if (dynamo_options.sideline)
sideline_init();
#endif
/* We can't clear this on detach like other vars b/c we need native threads
* to continue to avoid safe_read_tls_magic() in is_thread_tls_initialized().
* So we clear it on (re-)init in dynamorio_take_over_threads().
* From now until then, we avoid races where another thread invokes a
* safe_read during native signal delivery but we remove DR's handler before
* it reaches there and it is delivered to the app's handler instead, kind
* of like i#3535, by re-using the i#3535 mechanism of pointing at the only
* thread who could possibly have a dcontext.
* XXX: Should we rename this s/detacher_/singleton_/ or something?
*/
detacher_tid = IF_UNIX_ELSE(get_sys_thread_id(), INVALID_THREAD_ID);
/* thread-specific initialization for the first thread we inject in
* (in a race with injected threads, sometimes it is not the primary thread)
*/
/* i#117/PR 395156: it'd be nice to have mc here but would
* require changing start/stop API
*/
dynamo_thread_init(NULL, NULL, NULL, false);
/* i#2751: we need TLS to be set up to relocate and call init funcs. */
loader_init_epilogue(get_thread_private_dcontext());
/* We move vm_areas_init() below dynamo_thread_init() so we can have
* two things: 1) a dcontext and 2) a SIGSEGV handler, for TRY/EXCEPT
* inside vm_areas_init() for PR 361594's probes and for d_r_safe_read().
* This means vm_areas_thread_init() runs before vm_areas_init().
*/
if (!DYNAMO_OPTION(thin_client)) {
vm_areas_init();
#ifdef RCT_IND_BRANCH
/* relies on is_in_dynamo_dll() which needs vm_areas_init */
rct_init();
#endif
} else {
/* This is needed to handle exceptions in thin_client mode, mostly
* internal ones, but can be app ones too. */
dynamo_vm_areas_lock();
find_dynamo_library_vm_areas();
dynamo_vm_areas_unlock();
}
#ifdef ANNOTATIONS
annotation_init();
#endif
jitopt_init();
dr_attach_finished = create_broadcast_event();
/* New client threads rely on dr_app_started being initialized, so do
* that before initializing clients.
*/
dr_app_started = create_broadcast_event();
/* client last, in case it depends on other inits: must be after
* dynamo_thread_init so the client can use a dcontext (PR 216936).
* Note that we *load* the client library before installing our hooks,
* but call the client's init routine afterward so that we correctly
* report crashes (PR 200207).
* Note: DllMain in client libraries can crash and we still won't
* report it; we should document that client libraries shouldn't have
* a DllMain.
*/
instrument_init();
/* To give clients a chance to process pcaches as we load them, we
* delay the loading until we've initialized the clients.
*/
vm_area_delay_load_coarse_units();
#ifdef WINDOWS
if (!INTERNAL_OPTION(noasynch))
callback_interception_init_finish(); /* split for PR 200207: see above */
#endif
if (SELF_PROTECT_ON_CXT_SWITCH) {
protect_info = (protect_info_t *)global_unprotected_heap_alloc(
sizeof(protect_info_t) HEAPACCT(ACCT_OTHER));
ASSIGN_INIT_LOCK_FREE(protect_info->lock, protect_info);
protect_info->num_threads_unprot = 0; /* ENTERING_DR() below will inc to 1 */
protect_info->num_threads_suspended = 0;
if (INTERNAL_OPTION(single_privileged_thread)) {
/* FIXME: thread_initexit_lock must be a recursive lock! */
ASSERT_NOT_IMPLEMENTED(false);
/* grab the lock now -- the thread that is in dynamo must be holding
* the lock, and we are the initial thread in dynamo!
*/
d_r_mutex_lock(&thread_initexit_lock);
}
/* ENTERING_DR will increment, so decrement first
* FIXME: waste of protection change since will nop-unprotect!
*/
if (TEST(SELFPROT_DATA_CXTSW, DYNAMO_OPTION(protect_mask)))
datasec_writable_cxtswprot = 0;
/* FIXME case 8073: remove once freqprot not every cxt sw */
if (TEST(SELFPROT_DATA_FREQ, DYNAMO_OPTION(protect_mask)))
datasec_writable_freqprot = 0;
}
/* this thread is now entering DR */
ENTERING_DR();
#ifdef WINDOWS
if (DYNAMO_OPTION(early_inject)) {
/* AFTER callback_interception_init and self protect init and
* ENTERING_DR() */
early_inject_init();
}
#endif
}
dynamo_initialized = true;
/* Protect .data, assuming all vars there have been initialized. */
SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT);
/* internal-only options for testing run-once (case 3990) */
if (INTERNAL_OPTION(unsafe_crash_process)) {
SYSLOG_INTERNAL_ERROR("Crashing the process deliberately!");
*((int *)PTR_UINT_MINUS_1) = 0;
}
if (INTERNAL_OPTION(unsafe_hang_process)) {
event_t never_signaled = create_event();
SYSLOG_INTERNAL_ERROR("Hanging the process deliberately!");
wait_for_event(never_signaled, 0);
destroy_event(never_signaled);
}
return SUCCESS;
}
#ifdef UNIX
void
dynamorio_fork_init(dcontext_t *dcontext)
{
/* on a fork we want to re-initialize some data structures, especially
* log files, which we want a separate directory for
*/
thread_record_t **threads;
int i, num_threads;
# ifdef DEBUG
char parent_logdir[MAXIMUM_PATH];
# endif
/* re-cache app name, etc. that are using parent pid before we
* create log dirs (xref i#189/PR 452168)
*/
os_fork_init(dcontext);
/* sanity check, plus need to set this for statistics_init:
* even if parent did an execve, env var should be reset by now
*/
post_execve = (getenv(DYNAMORIO_VAR_EXECVE) != NULL);
ASSERT(!post_execve);
# ifdef DEBUG
/* copy d_r_stats->logdir
* d_r_stats->logdir is static, so current copy is fine, don't need
* frozen copy
*/
strncpy(parent_logdir, d_r_stats->logdir, MAXIMUM_PATH - 1);
parent_logdir[MAXIMUM_PATH - 1] = '\0'; /* strncpy does not null-terminate at max */
# endif
if (get_log_dir(PROCESS_DIR, NULL, NULL)) {
/* we want brand new log dir */
enable_new_log_dir();
create_log_dir(PROCESS_DIR);
}
# ifdef DEBUG
/* just like dynamorio_app_init, create main_logfile before stats */
if (d_r_stats->loglevel > 0) {
/* we want brand new log files. os_fork_init() closed inherited files. */
main_logfile = open_log_file(main_logfile_name(), NULL, 0);
print_file(main_logfile, "%s\n", dynamorio_version_string);
print_file(main_logfile, "New log file for child %d forked by parent %d\n",
d_r_get_thread_id(), get_parent_id());
print_file(main_logfile, "Parent's log dir: %s\n", parent_logdir);
}
d_r_stats->process_id = get_process_id();
if (d_r_stats->loglevel > 0) {
/* FIXME: share these few lines of code w/ dynamorio_app_init? */
LOG(GLOBAL, LOG_TOP, 1, "Running: %s\n", d_r_stats->process_name);
# ifndef _WIN32_WCE
LOG(GLOBAL, LOG_TOP, 1, "DYNAMORIO_OPTIONS: %s\n", d_r_option_string);
# endif
}
# endif /* DEBUG */
vmm_heap_fork_init(dcontext);
/* must re-hash parent entry in threads table, plus no longer have any
* other threads (fork -> we're alone in address space), so clear
* out entire thread table, then add child
*/
d_r_mutex_lock(&thread_initexit_lock);
get_list_of_threads_ex(&threads, &num_threads, true /*include execve*/);
for (i = 0; i < num_threads; i++) {
if (threads[i] == dcontext->thread_record)
remove_thread(threads[i]->id);
else
dynamo_other_thread_exit(threads[i]);
}
d_r_mutex_unlock(&thread_initexit_lock);
global_heap_free(threads,
num_threads * sizeof(thread_record_t *) HEAPACCT(ACCT_THREAD_MGT));
add_thread(get_process_id(), d_r_get_thread_id(), true /*under dynamo control*/,
dcontext);
GLOBAL_STAT(num_threads) = 1;
# ifdef DEBUG
if (d_r_stats->loglevel > 0) {
/* need a new thread-local logfile */
dcontext->logfile = open_log_file(thread_logfile_name(), NULL, 0);
print_file(dcontext->logfile, "%s\n", dynamorio_version_string);
print_file(dcontext->logfile, "New log file for child %d forked by parent %d\n",
d_r_get_thread_id(), get_parent_id());
LOG(THREAD, LOG_TOP | LOG_THREADS, 1, "THREAD %d (dcontext " PFX ")\n\n",
d_r_get_thread_id(), dcontext);
}
# endif
num_threads = 1;
/* FIXME: maybe should have a callback list for who wants to be notified
* on a fork -- probably everyone who makes a log file on init.
*/
fragment_fork_init(dcontext);
/* this must be called after dynamo_other_thread_exit() above */
signal_fork_init(dcontext);
if (CLIENTS_EXIST()) {
instrument_fork_init(dcontext);
}
}
#endif /* UNIX */
/* Makes DynamoRIO usable as a library for a standalone client
* application (as opposed to a client library that works with
* DynamoRIO in executing a target application), e.g., as an IA-32
* disassembly library.
*/
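/* Hedged usage sketch: standalone consumers normally enter through the
* public dr_standalone_init()/dr_standalone_exit() wrappers, e.g. to
* decode an instruction:
*     void *dc = dr_standalone_init();
*     instr_t instr;
*     instr_init(dc, &instr);
*     decode(dc, pc, &instr);
*     ...
*     instr_free(dc, &instr);
*     dr_standalone_exit();
*/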
dcontext_t *
standalone_init(void)
{
dcontext_t *dcontext;
int count = atomic_add_exchange_int(&standalone_init_count, 1);
if (count > 1 || dynamo_initialized)
return GLOBAL_DCONTEXT;
standalone_library = true;
/* We have release-build stats now so this is not just DEBUG */
d_r_stats = &nonshared_stats;
/* No reason to limit heap size when there's no code cache. */
IF_X64(dynamo_options.reachable_heap = false;)
dynamo_options.vm_base_near_app = false;
#if defined(INTERNAL) && defined(DEADLOCK_AVOIDANCE)
/* avoid issues w/ GLOBAL_DCONTEXT instead of thread dcontext */
dynamo_options.deadlock_avoidance = false;
#endif
#ifdef UNIX
os_page_size_init((const char **)our_environ, is_our_environ_followed_by_auxv());
#endif
#ifdef WINDOWS
/* MUST do this before making any system calls */
if (!syscalls_init())
return NULL; /* typically b/c of unsupported OS version */
#endif
d_r_config_init();
options_init();
vmm_heap_init();
d_r_heap_init();
dynamo_heap_initialized = true;
dynamo_vm_areas_init();
d_r_decode_init();
proc_init();
d_r_os_init();
config_heap_init();
#ifdef STANDALONE_UNIT_TEST
os_tls_init();
dcontext = create_new_dynamo_context(true /*initial*/, NULL, NULL);
set_thread_private_dcontext(dcontext);
/* sanity check */
ASSERT(get_thread_private_dcontext() == dcontext);
heap_thread_init(dcontext);
# ifdef DEBUG
/* XXX: share code w/ main init routine? */
nonshared_stats.logmask = LOG_ALL;
options_init();
if (d_r_stats->loglevel > 0) {
char initial_options[MAX_OPTIONS_STRING];
main_logfile = open_log_file(main_logfile_name(), NULL, 0);
print_file(main_logfile, "%s\n", dynamorio_version_string);
print_file(main_logfile, "Log file for standalone unit test\n");
get_dynamo_options_string(&dynamo_options, initial_options,
sizeof(initial_options), true);
SYSLOG_INTERNAL_INFO("Initial options = %s", initial_options);
print_file(main_logfile, "\n");
}
# endif /* DEBUG */
#else
/* rather than ask the user to call some thread-init routine in
* every thread, we just use global dcontext everywhere (i#548)
*/
dcontext = GLOBAL_DCONTEXT;
#endif
/* In case standalone_exit() is omitted or there's a crash, we clean up any
* .1config file right now. The only loss is that we can't synch options,
* but that should be less important for standalone, so we disable synching.
*/
/* options are never made read-only for standalone */
dynamo_options.dynamic_options = false;
dynamo_initialized = true;
return dcontext;
}
void
standalone_exit(void)
{
int count = atomic_add_exchange_int(&standalone_init_count, -1);
if (count != 0)
return;
/* We support re-attach by setting doing_detach. */
doing_detach = true;
#ifdef STANDALONE_UNIT_TEST
dcontext_t *dcontext = get_thread_private_dcontext();
set_thread_private_dcontext(NULL);
heap_thread_exit(dcontext);
delete_dynamo_context(dcontext, true);
/* We can't call os_tls_exit() b/c we don't have safe_read support for
* the TLS magic read on Linux.
*/
#endif
config_heap_exit();
os_fast_exit();
os_slow_exit();
#if !defined(STANDALONE_UNIT_TEST) || !defined(AARCH64)
/* XXX: The lock setup is somehow messed up on AArch64. Disabling cleanup. */
dynamo_vm_areas_exit();
#endif
#ifndef STANDALONE_UNIT_TEST
/* We have a leak b/c we can't call os_tls_exit(). For now we simplify
* and leave it alone.
*/
d_r_heap_exit();
vmm_heap_exit();
#endif
options_exit();
d_r_config_exit();
doing_detach = false;
standalone_library = false;
dynamo_initialized = false;
dynamo_options_initialized = false;
dynamo_heap_initialized = false;
options_detach();
}
/* Perform exit tasks that require full thread data structs, which we have
* already cleaned up by the time we reach dynamo_shared_exit() for both
* debug and detach paths.
*/
void
dynamo_process_exit_with_thread_info(void)
{
perscache_fast_exit(); /* "fast" b/c called in release as well */
}
/* shared between app_exit and detach */
int
dynamo_shared_exit(thread_record_t *toexit /* must ==cur thread for Linux */
_IF_WINDOWS(bool detach_stacked_callbacks))
{
DEBUG_DECLARE(uint endtime);
/* set this now, could already be set */
dynamo_exited = true;
/* avoid time() for libc independence */
DODEBUG(endtime = query_time_seconds(););
LOG(GLOBAL, LOG_STATS, 1, "\n#### Statistics for entire process:\n");
LOG(GLOBAL, LOG_STATS, 1, "Total running time: %d seconds\n", endtime - starttime);
#ifdef PAPI
hardware_perfctr_exit();
#endif
#ifdef DEBUG
# if defined(INTERNAL) && defined(X86)
print_optimization_stats();
# endif /* INTERNAL && X86 */
DOLOG(1, LOG_STATS, { dump_global_stats(false); });
#endif /* DEBUG */
if (SELF_PROTECT_ON_CXT_SWITCH) {
DELETE_LOCK(protect_info->lock);
global_unprotected_heap_free(protect_info,
sizeof(protect_info_t) HEAPACCT(ACCT_OTHER));
}
/* call all component exit routines (CAUTION: order is important here) */
DELETE_RECURSIVE_LOCK(thread_in_DR_exclusion);
DOSTATS({
LOG(GLOBAL, LOG_TOP | LOG_THREADS, 1,
"fcache_stats_exit: before fragment cleanup\n");
DOLOG(1, LOG_CACHE, fcache_stats_exit(););
});
#ifdef RCT_IND_BRANCH
if (!DYNAMO_OPTION(thin_client))
rct_exit();
#endif
fragment_exit();
#ifdef ANNOTATIONS
annotation_exit();
#endif
jitopt_exit();
/* We tell the client as soon as possible in case it wants to use services from other
* components. Must be after fragment_exit() so that the client gets all the
* fragment_deleted() callbacks (xref PR 228156). FIXME - might be issues with the
* client trying to use api routines that depend on fragment state.
*/
instrument_exit_event();
/* We only need to do a second synch-all if there are sideline client threads. */
if (d_r_get_num_threads() > 1)
synch_with_threads_at_exit(exit_synch_state(), false /*post-exit*/);
/* only current thread is alive */
dynamo_exited_all_other_threads = true;
fragment_exit_post_sideline();
/* dynamo_exited_and_cleaned should be set after the second synch-all.
* If it were set earlier, after the first synch-all, some client thread
* could leak memory due to dynamo_thread_exit_pre_client being skipped in
* dynamo_thread_exit_common when called from exiting client threads.
*/
dynamo_exited_and_cleaned = true;
destroy_event(dr_app_started);
destroy_event(dr_attach_finished);
/* Make thread and process exit calls before we clean up thread data. */
loader_make_exit_calls(get_thread_private_dcontext());
/* we want dcontext around for loader_exit() */
if (get_thread_private_dcontext() != NULL)
loader_thread_exit(get_thread_private_dcontext());
/* This will unload client libs, which we delay until after they receive their
* thread exit calls in loader_thread_exit().
*/
instrument_exit();
loader_exit();
if (toexit != NULL) {
/* Free detaching thread's dcontext.
* Restoring the teb fields or segment registers can only be done
* on the current thread, which must be toexit.
*/
#ifdef WINDOWS
/* XXX i#5340: We used to go through dynamo_other_thread_exit() which rewinds
* the kstats stack as below. To avoid a kstats assert on this new path we
* repeat it here but it seems like we shouldn't need it.
*/
KSTOP_REWIND_DC(get_thread_private_dcontext(), thread_measured);
KSTART_DC(get_thread_private_dcontext(), thread_measured);
#endif
ASSERT(toexit->id == d_r_get_thread_id());
dynamo_thread_exit();
}
if (IF_WINDOWS_ELSE(!detach_stacked_callbacks, true)) {
/* We don't fully free cur thread until after client exit event (PR 536058) */
if (thread_lookup(d_r_get_thread_id()) == NULL) {
LOG(GLOBAL, LOG_TOP | LOG_THREADS, 1,
"Current thread never under DynamoRIO control, not exiting it\n");
} else {
/* call thread_exit even if !under_dynamo_control, could have
* been at one time
*/
/* exit this thread now */
dynamo_thread_exit();
}
}
/* now that the final thread is exited, free the all_threads memory */
d_r_mutex_lock(&all_threads_lock);
global_heap_free(all_threads,
HASHTABLE_SIZE(ALL_THREADS_HASH_BITS) *
sizeof(thread_record_t *) HEAPACCT(ACCT_THREAD_MGT));
all_threads = NULL;
d_r_mutex_unlock(&all_threads_lock);
#ifdef WINDOWS
/* for -private_loader we do this here to catch more exit-time crashes */
if (!INTERNAL_OPTION(noasynch) && INTERNAL_OPTION(private_loader) && !doing_detach)
callback_interception_unintercept();
/* callback_interception_exit must be after fragment exit for clients so
* that fragment_exit->frees fragments->instrument_fragment_deleted->
* hide_tag_from_fragment->is_intercepted_app_pc won't crash. Xref PR 228156.
*/
if (!INTERNAL_OPTION(noasynch)) {
callback_interception_exit();
}
#endif
d_r_link_exit();
fcache_exit();
d_r_monitor_exit();
synch_exit();
d_r_arch_exit(IF_WINDOWS(detach_stacked_callbacks));
#ifdef CALL_PROFILE
/* above os_exit to avoid eventlog_mutex trigger if we're the first to
* create a log file
*/
profile_callers_exit();
#endif
os_fast_exit();
os_slow_exit();
native_exec_exit(); /* before vm_areas_exit for using dynamo_areas */
vm_areas_exit();
perscache_slow_exit(); /* fast called in dynamo_process_exit_with_thread_info() */
modules_exit(); /* after aslr_exit() from os_slow_exit(),
* after vm_areas & perscache exits */
moduledb_exit(); /* before heap_exit */
#ifdef HOT_PATCHING_INTERFACE
if (DYNAMO_OPTION(hot_patching))
hotp_exit();
#endif
#ifdef WINDOWS
/* Free exception stack before calling heap_exit */
stack_free(exception_stack, EXCEPTION_STACK_SIZE);
exception_stack = NULL;
#endif
config_heap_exit();
d_r_heap_exit();
vmm_heap_exit();
diagnost_exit();
data_section_exit();
/* Funny dependences: options exit just frees lock, not destroying
* any options that are needed for other exits, so do it prior to
* checking locks in debug build. We have a separate options_detach()
* which resets options for re-attach.
*/
options_exit();
utils_exit();
d_r_config_exit();
#ifdef KSTATS
kstat_exit();
#endif
DELETE_LOCK(all_threads_lock);
DELETE_LOCK(thread_initexit_lock);
DOLOG(1, LOG_STATS, {
/* dump after cleaning up to make it easy to check if stats that
* are inc-ed and dec-ed actually come down to 0
*/
dump_global_stats(false);
});
if (INTERNAL_OPTION(rstats_to_stderr))
dump_global_rstats_to_stderr();
statistics_exit();
#ifdef DEBUG
# ifdef DEADLOCK_AVOIDANCE
ASSERT(locks_not_closed() == 0);
# endif
dynamo_exited_log_and_stats = true;
if (main_logfile != STDERR) {
/* do it this way just in case someone tries to log to the global file
* right now */
file_t file_temp = main_logfile;
main_logfile = INVALID_FILE;
close_log_file(file_temp);
}
#else
# ifdef DEADLOCK_AVOIDANCE
ASSERT(locks_not_closed() == 0);
# endif
#endif /* DEBUG */
dynamo_initialized = false;
dynamo_started = false;
return SUCCESS;
}
/* NOINLINE because dynamorio_app_exit is a stopping point. */
NOINLINE int
dynamorio_app_exit(void)
{
return dynamo_process_exit();
}
/* synchs with all threads using synch type synch_res.
* also sets dynamo_exited to true.
* does not resume the threads but does release the thread_initexit_lock.
*/
static void
synch_with_threads_at_exit(thread_synch_state_t synch_res, bool pre_exit)
{
int num_threads;
thread_record_t **threads;
DEBUG_DECLARE(bool ok;)
/* If we fail to suspend a thread (e.g., privilege
* problems) ignore it. XXX: retry instead?
*/
uint flags = THREAD_SYNCH_SUSPEND_FAILURE_IGNORE;
if (pre_exit) {
/* i#297: we only synch client threads after process exit event. */
flags |= THREAD_SYNCH_SKIP_CLIENT_THREAD;
}
LOG(GLOBAL, LOG_TOP | LOG_THREADS, 1,
"\nsynch_with_threads_at_exit: cleaning up %d un-terminated threads\n",
d_r_get_num_threads());
#ifdef WINDOWS
/* make sure client nudges are finished */
wait_for_outstanding_nudges();
#endif
/* xref case 8747, requesting suspended is preferable to terminated and it
* doesn't make a difference here which we use (since the process is about
* to die).
* On Linux, however, we do not have dependencies on OS thread
* properties like we do on Windows (TEB, etc.), and our suspended
* threads use their sigstacks and ostd data structs, making cleanup
* while still catching other leaks more difficult: thus it's
* simpler to terminate and then clean up. FIXME: by terminating
* we'll raise SIGCHLD that may not have been raised natively if the
* whole group went down in a single SYS_exit_group. Instead we
* could have the suspended thread move from the sigstack-reliant
* loop to a stack-free loop (xref i#95).
*/
IF_UNIX(dynamo_exiting = true;) /* include execve-exited vfork threads */
DEBUG_DECLARE(ok =)
synch_with_all_threads(synch_res, &threads, &num_threads,
/* Case 6821: other synch-all-thread uses that
* only care about threads carrying fcache
* state can ignore us
*/
THREAD_SYNCH_NO_LOCKS_NO_XFER, flags);
ASSERT(ok);
ASSERT(threads == NULL && num_threads == 0); /* We asked for CLEANED */
/* the synch_with_all_threads function grabbed the
* thread_initexit_lock for us! */
/* do this now after all threads we know about are killed and
* while we hold the thread_initexit_lock so any new threads that
* are waiting on it won't get in our way (see thread_init()) */
dynamo_exited = true;
end_synch_with_all_threads(threads, num_threads, false /*don't resume*/);
}
static thread_synch_state_t
exit_synch_state(void)
{
thread_synch_state_t synch_res = IF_WINDOWS_ELSE(THREAD_SYNCH_SUSPENDED_AND_CLEANED,
THREAD_SYNCH_TERMINATED_AND_CLEANED);
#if defined(DR_APP_EXPORTS) && defined(UNIX)
if (dr_api_exit) {
/* Don't terminate the app's threads in case the app plans to continue
* after dr_app_cleanup(). Note that today we don't fully support that
* anyway: the app should use dr_app_stop_and_cleanup() whose detach
* code won't come here.
*/
synch_res = THREAD_SYNCH_SUSPENDED_AND_CLEANED;
}
#endif
return synch_res;
}
#ifdef DEBUG
/* cleanup after the application has exited */
static int
dynamo_process_exit_cleanup(void)
{
/* CAUTION: this should only be invoked after all app threads have stopped */
if (!dynamo_exited && !INTERNAL_OPTION(nullcalls)) {
APP_EXPORT_ASSERT(dynamo_initialized, "Improper DynamoRIO initialization");
/* we deliberately do NOT clean up d_r_initstack (which was
* allocated using a separate mmap and so is not part of some
* large unit that is de-allocated), as it is used in special
* circumstances to call us...FIXME: is this memory leak ok?
* is there a better solution besides assuming the app stack?
*/
# ifdef SIDELINE
if (dynamo_options.sideline) {
/* exit now to make thread cleanup simpler */
sideline_exit();
}
# endif
/* perform exit tasks that require full thread data structs */
dynamo_process_exit_with_thread_info();
if (INTERNAL_OPTION(single_privileged_thread)) {
d_r_mutex_unlock(&thread_initexit_lock);
}
/* if ExitProcess called before all threads terminated, they won't
* all have gone through dynamo_thread_exit, so clean them up now
* so we can get stats about them
*
* we don't check control_all_threads b/c we're just killing
* the threads we know about here
*/
synch_with_threads_at_exit(exit_synch_state(), true /*pre-exit*/);
/* now that the APC interception point is unpatched and
* dynamo_exited is set and we've killed all the threads we know
* about, the assumption is that no other threads will be running in
* dynamorio code from here on out (esp. when we get into shared exit)
* that will do anything dangerous (there could possibly be
* a thread in the APC interception code prior to reaching thread_init,
* but it will only global log and do thread_lookup, which should be
* safe throughout) */
/* In order to pass the client a dcontext in the process exit event
* we do some thread cleanup early for the final thread so we can delay
* the rest (PR 536058). This is a little risky in that we
* clean up dcontext->fragment_field, which is used for lots of
* things like couldbelinking (and thus we have to disable some API
* routines in the thread exit event: i#1989).
*/
dynamo_thread_exit_pre_client(get_thread_private_dcontext(), d_r_get_thread_id());
# ifdef WINDOWS
/* FIXME: our call un-interception isn't atomic so there's a (minuscule)
* chance of something going wrong if a new thread is just hitting its init APC
*/
/* w/ the app's loader we must remove our LdrUnloadDll hook
* before we unload the client lib (and thus we miss client
* exit crashes): xref PR 200207.
*/
if (!INTERNAL_OPTION(noasynch) && !INTERNAL_OPTION(private_loader)) {
callback_interception_unintercept();
}
# else /* UNIX */
unhook_vsyscall();
# endif /* UNIX */
return dynamo_shared_exit(NULL /* not detaching */
_IF_WINDOWS(false /* not detaching */));
}
return SUCCESS;
}
#endif /* DEBUG */
int
dynamo_nullcalls_exit(void)
{
/* this routine is used when nullcalls is turned on
* simply to get perfctr numbers in a log file
*/
ASSERT(INTERNAL_OPTION(nullcalls));
#ifdef PAPI
hardware_perfctr_exit();
#endif
#ifdef DEBUG
if (main_logfile != STDERR) {
close_log_file(main_logfile);
main_logfile = INVALID_FILE;
}
#endif /* DEBUG */
dynamo_exited = true;
return SUCCESS;
}
/* called when we see that the process is about to exit */
int
dynamo_process_exit(void)
{
#ifndef DEBUG
bool each_thread;
#endif
SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT);
synchronize_dynamic_options();
SYSLOG(SYSLOG_INFORMATION, INFO_PROCESS_STOP, 2, get_application_name(),
get_application_pid());
#ifdef DEBUG
if (!dynamo_exited) {
if (INTERNAL_OPTION(nullcalls)) {
/* if nullcalls is on we still do perfctr stats, and this is
* the only place we can print them out and exit
*/
dynamo_nullcalls_exit();
} else {
/* we don't check automatic_startup -- even if the app_ interface
* is used, we are about to be gone from the process address
* space, so we clean up now
*/
LOG(GLOBAL, LOG_TOP, 1,
"\ndynamo_process_exit from thread " TIDFMT " -- cleaning up dynamo\n",
d_r_get_thread_id());
dynamo_process_exit_cleanup();
}
}
return SUCCESS;
#else
if (dynamo_exited)
return SUCCESS;
/* don't need to do much!
* we didn't create any IPC objects or anything that might be persistent
* beyond our death, we're not holding any systemwide locks, etc.
*/
/* It is not clear whether the Event Log service handles unterminated connections */
/* Do we need profile data for each thread?
* Note that windows prof_pcs duplicates the thread walk in d_r_os_exit()
* FIXME: should combine that thread walk with this one
*/
each_thread = TRACEDUMP_ENABLED();
# ifdef UNIX
each_thread = each_thread || INTERNAL_OPTION(profile_pcs);
# endif
# ifdef KSTATS
each_thread = each_thread || DYNAMO_OPTION(kstats);
# endif
each_thread = each_thread ||
/* If we don't need a thread exit event, avoid the possibility of
* racy crashes (PR 470957) by not calling instrument_thread_exit()
*/
(!INTERNAL_OPTION(nullcalls) && dr_thread_exit_hook_exists() &&
!DYNAMO_OPTION(skip_thread_exit_at_exit));
if (DYNAMO_OPTION(synch_at_exit)
/* by default we synch if any exit event exists */
|| (!DYNAMO_OPTION(multi_thread_exit) && dr_exit_hook_exists()) ||
(!DYNAMO_OPTION(skip_thread_exit_at_exit) && dr_thread_exit_hook_exists())) {
/* Needed primarily for clients but technically all configurations
* can have racy crashes at exit time (xref PR 470957).
*/
synch_with_threads_at_exit(exit_synch_state(), true /*pre-exit*/);
} else
dynamo_exited = true;
if (each_thread) {
thread_record_t **threads;
int num, i;
d_r_mutex_lock(&thread_initexit_lock);
get_list_of_threads(&threads, &num);
for (i = 0; i < num; i++) {
if (IS_CLIENT_THREAD(threads[i]->dcontext))
continue;
/* FIXME: separate trace dump from rest of fragment cleanup code */
if (TRACEDUMP_ENABLED() || true) {
/* We always want to call this for CI builds so we can get the
* dr_fragment_deleted() callbacks.
*/
fragment_thread_exit(threads[i]->dcontext);
}
# ifdef UNIX
if (INTERNAL_OPTION(profile_pcs))
pcprofile_thread_exit(threads[i]->dcontext);
# endif
# ifdef KSTATS
if (DYNAMO_OPTION(kstats))
kstat_thread_exit(threads[i]->dcontext);
# endif
/* Inform client of all thread exits */
if (!INTERNAL_OPTION(nullcalls) && !DYNAMO_OPTION(skip_thread_exit_at_exit)) {
instrument_thread_exit_event(threads[i]->dcontext);
/* i#1617: ensure we do all cleanup of priv libs */
if (threads[i]->id != d_r_get_thread_id()) /* i#1617: must delay this */
loader_thread_exit(threads[i]->dcontext);
}
}
global_heap_free(threads,
num * sizeof(thread_record_t *) HEAPACCT(ACCT_THREAD_MGT));
d_r_mutex_unlock(&thread_initexit_lock);
}
/* PR 522783: must be before we clear dcontext (for clients)! */
/* must also be prior to fragment_exit so we actually freeze pcaches (i#703) */
dynamo_process_exit_with_thread_info();
/* FIXME: separate trace dump from rest of fragment cleanup code. For client
* interface we need to call fragment_exit to get all the fragment deleted events. */
if (TRACEDUMP_ENABLED() || dr_fragment_deleted_hook_exists())
fragment_exit();
/* Inform client of process exit */
if (!INTERNAL_OPTION(nullcalls)) {
# ifdef WINDOWS
/* instrument_exit() unloads the client library, so make sure
* LdrUnloadDll isn't hooked if using the app loader.
*/
if (!INTERNAL_OPTION(noasynch) && !INTERNAL_OPTION(private_loader)) {
callback_interception_unintercept();
}
# endif
# ifdef UNIX
/* i#2976: unhook prior to client exit if modules are being watched */
if (dr_modload_hook_exists())
unhook_vsyscall();
# endif
/* Must be after fragment_exit() so that the client gets all the
* fragment_deleted() callbacks (xref PR 228156). FIXME - might be issues
* with the client trying to use api routines that depend on fragment state.
*/
instrument_exit_event();
/* We only need to do a second synch-all if there are sideline client threads. */
if (d_r_get_num_threads() > 1)
synch_with_threads_at_exit(exit_synch_state(), false /*post-exit*/);
dynamo_exited_all_other_threads = true;
/* i#1617: We need to call client library fini routines for global
* destructors, etc.
*/
if (!INTERNAL_OPTION(nullcalls) && !DYNAMO_OPTION(skip_thread_exit_at_exit))
loader_thread_exit(get_thread_private_dcontext());
/* This will unload client libs, which we delay until after they receive their
* thread exit calls in loader_thread_exit().
*/
instrument_exit();
loader_exit();
/* for -private_loader we do this here to catch more exit-time crashes */
# ifdef WINDOWS
if (!INTERNAL_OPTION(noasynch) && INTERNAL_OPTION(private_loader))
callback_interception_unintercept();
# endif
}
fragment_exit_post_sideline();
# ifdef CALL_PROFILE
profile_callers_exit();
# endif
# ifdef KSTATS
if (DYNAMO_OPTION(kstats))
kstat_exit();
# endif
/* so make sure eventlog connection is terminated (if present) */
os_fast_exit();
if (INTERNAL_OPTION(rstats_to_stderr))
dump_global_rstats_to_stderr();
return SUCCESS;
#endif /* !DEBUG */
}
void
dynamo_exit_post_detach(void)
{
/* i#2157: best-effort re-init in case of re-attach */
do_once_generation++; /* Increment the generation in case we re-attach */
dynamo_initialized = false;
dynamo_options_initialized = false;
dynamo_heap_initialized = false;
automatic_startup = false;
control_all_threads = false;
dr_api_entry = false;
dr_api_exit = false;
#ifdef UNIX
dynamo_exiting = false;
#endif
dynamo_exited = false;
dynamo_exited_all_other_threads = false;
dynamo_exited_and_cleaned = false;
#ifdef DEBUG
dynamo_exited_log_and_stats = false;
#endif
dynamo_resetting = false;
#ifdef UNIX
post_execve = false;
#endif
vm_areas_post_exit();
heap_post_exit();
}
dcontext_t *
create_new_dynamo_context(bool initial, byte *dstack_in, priv_mcontext_t *mc)
{
dcontext_t *dcontext;
size_t alloc = sizeof(dcontext_t) + proc_get_cache_line_size();
void *alloc_start =
(void *)((TEST(SELFPROT_GLOBAL, dynamo_options.protect_mask) &&
!TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
?
/* if protecting global but not dcontext, put whole thing in unprot
mem */
global_unprotected_heap_alloc(alloc HEAPACCT(ACCT_OTHER))
: global_heap_alloc(alloc HEAPACCT(ACCT_OTHER)));
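/* We over-allocated by one cache line above, so bumping the pointer to the
* next cache-line boundary keeps the dcontext itself cache-aligned (checked
* by the asserts below).
*/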
dcontext = (dcontext_t *)proc_bump_to_end_of_cache_line((ptr_uint_t)alloc_start);
ASSERT(proc_is_cache_aligned(dcontext));
#ifdef X86
/* 264138: ensure xmm/ymm slots are aligned so we can use vmovdqa */
ASSERT(ALIGNED(get_mcontext(dcontext)->simd, ZMM_REG_SIZE));
/* also ensure we don't have extra padding beyond x86.asm defines */
ASSERT(sizeof(priv_mcontext_t) ==
IF_X64_ELSE(18, 10) * sizeof(reg_t) + PRE_XMM_PADDING +
MCXT_TOTAL_SIMD_SLOTS_SIZE + MCXT_TOTAL_OPMASK_SLOTS_SIZE);
#elif defined(ARM)
/* FIXME i#1551: add arm alignment check if any */
#endif /* X86/ARM */
/* Put here all one-time dcontext field initialization.
* Make sure to update create_callback_dcontext to share these
* fields across callback dcontexts for the same thread.
*/
/* must set to 0 so can tell if initialized for callbacks! */
memset(dcontext, 0x0, sizeof(dcontext_t));
dcontext->allocated_start = alloc_start;
/* we share a single dstack across all callbacks */
if (initial) {
/* DrMi#1723: our dstack needs to be at a higher address than the app
* stack. If mc passed, use its xsp; else use cur xsp (initial thread
* is on the app stack here: xref i#1105), for lower bound for dstack.
*/
byte *app_xsp;
if (mc == NULL)
GET_STACK_PTR(app_xsp);
else
app_xsp = (byte *)mc->xsp;
if (dstack_in == NULL) {
dcontext->dstack = (byte *)stack_alloc(DYNAMORIO_STACK_SIZE, app_xsp);
} else
dcontext->dstack = dstack_in; /* xref i#149/PR 403015 */
#ifdef WINDOWS
DOCHECK(1, {
if (dcontext->dstack < app_xsp)
SYSLOG_INTERNAL_WARNING_ONCE("dstack is below app xsp");
});
#endif
} else {
/* dstack may be pre-allocated only at thread init, not at callback */
ASSERT(dstack_in == NULL);
}
if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) {
dcontext->upcontext.separate_upcontext = global_unprotected_heap_alloc(
sizeof(unprotected_context_t) HEAPACCT(ACCT_OTHER));
/* don't need to initialize upcontext */
LOG(GLOBAL, LOG_TOP, 2, "new dcontext=" PFX ", dcontext->upcontext=" PFX "\n",
dcontext, dcontext->upcontext.separate_upcontext);
dcontext->upcontext_ptr = dcontext->upcontext.separate_upcontext;
} else
dcontext->upcontext_ptr = &(dcontext->upcontext.upcontext);
#ifdef HOT_PATCHING_INTERFACE
/* Set the hot patch exception state to be empty/unused. */
DODEBUG(memset(&dcontext->hotp_excpt_state, -1, sizeof(dr_jmp_buf_t)););
#endif
ASSERT(dcontext->try_except.try_except_state == NULL);
DODEBUG({ dcontext->logfile = INVALID_FILE; });
dcontext->owning_thread = d_r_get_thread_id();
#ifdef UNIX
dcontext->owning_process = get_process_id();
#endif
/* thread_record is set in add_thread */
/* all of the thread-private fcache and hashtable fields are shared
* among all dcontext instances of a thread, so the caller must
* set those fields
*/
/* rest of dcontext initialization happens in initialize_dynamo_context(),
* which is executed for each dr_app_start() and each
* callback start
*/
return dcontext;
}
static void
delete_dynamo_context(dcontext_t *dcontext, bool free_stack)
{
if (free_stack) {
ASSERT(dcontext->dstack != NULL);
ASSERT(!is_currently_on_dstack(dcontext));
LOG(GLOBAL, LOG_THREADS, 1, "Freeing DR stack " PFX "\n", dcontext->dstack);
stack_free(dcontext->dstack, DYNAMORIO_STACK_SIZE);
} /* else will be cleaned up by caller */
ASSERT(dcontext->try_except.try_except_state == NULL);
if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) {
global_unprotected_heap_free(dcontext->upcontext.separate_upcontext,
sizeof(unprotected_context_t) HEAPACCT(ACCT_OTHER));
}
if (TEST(SELFPROT_GLOBAL, dynamo_options.protect_mask) &&
!TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) {
/* if protecting global but not dcontext, we put whole thing in unprot mem */
global_unprotected_heap_free(dcontext->allocated_start,
sizeof(dcontext_t) +
proc_get_cache_line_size() HEAPACCT(ACCT_OTHER));
} else {
global_heap_free(dcontext->allocated_start,
sizeof(dcontext_t) +
proc_get_cache_line_size() HEAPACCT(ACCT_OTHER));
}
}
/* This routine is called not only at thread initialization,
* but for every callback, etc. that gets a fresh execution
* environment!
*/
void
initialize_dynamo_context(dcontext_t *dcontext)
{
/* we can't just zero out the whole thing b/c we have persistent state
* (fields kept across callbacks, like dstack, module-private fields, next &
* prev, etc.)
*/
memset(dcontext->upcontext_ptr, 0, sizeof(unprotected_context_t));
dcontext->initialized = true;
dcontext->whereami = DR_WHERE_APP;
dcontext->next_tag = NULL;
dcontext->native_exec_postsyscall = NULL;
memset(dcontext->native_retstack, 0, sizeof(dcontext->native_retstack));
dcontext->native_retstack_cur = 0;
dcontext->isa_mode = DEFAULT_ISA_MODE;
#ifdef ARM
dcontext->encode_state[0] = 0;
dcontext->encode_state[1] = 0;
dcontext->decode_state[0] = 0;
dcontext->decode_state[1] = 0;
#endif
dcontext->sys_num = 0;
#ifdef WINDOWS
dcontext->app_errno = 0;
# ifdef DEBUG
dcontext->is_client_thread_exiting = false;
# endif
dcontext->sys_param_base = NULL;
/* always initialize aslr_context */
dcontext->aslr_context.sys_aslr_clobbered = 0;
dcontext->aslr_context.randomized_section_handle = INVALID_HANDLE_VALUE;
dcontext->aslr_context.original_image_section_handle = INVALID_HANDLE_VALUE;
dcontext->aslr_context.original_section_base = ASLR_INVALID_SECTION_BASE;
# ifdef DEBUG
dcontext->aslr_context.last_app_section_handle = INVALID_HANDLE_VALUE;
# endif
/* note that aslr_context.last_child_padded is preserved across callbacks */
dcontext->ignore_enterexit = false;
#else
dcontext->sys_param0 = 0;
dcontext->sys_param1 = 0;
dcontext->sys_param2 = 0;
#endif
#ifdef UNIX
dcontext->signals_pending = 0;
#endif
/* all thread-private fields are initialized in dynamo_thread_init
* or in create_callback_dcontext because they must be initialized differently
* in those two cases
*/
set_last_exit(dcontext, (linkstub_t *)get_starting_linkstub());
#ifdef PROFILE_RDTSC
dcontext->start_time = (uint64)0;
dcontext->prev_fragment = NULL;
dcontext->cache_frag_count = (uint64)0;
{
int i;
for (i = 0; i < 10; i++) {
dcontext->cache_time[i] = (uint64)0;
dcontext->cache_count[i] = (uint64)0;
}
}
#endif
#ifdef DEBUG
dcontext->in_opnd_disassemble = false;
#endif
#ifdef WINDOWS
/* Other pieces of DR -- callback & APC handling, detach -- test
* asynch_target to determine where the next app pc to execute is
* stored. Init it to 0 to indicate that this context's most recent
* syscall was not executed from handle_system_call().
*/
dcontext->asynch_target = NULL;
/* next_saved and prev_unused are zeroed out when dcontext is
* created; we shouldn't zero them here, they may have valid data
*/
dcontext->valid = true;
#endif
#ifdef HOT_PATCHING_INTERFACE
dcontext->nudge_thread = false; /* Fix for case 5367. */
#endif
#ifdef CHECK_RETURNS_SSE2
/* initialize sse2 index with 0
* go ahead and use eax, it's dead (about to return)
*/
# ifdef UNIX
asm("movl $0, %eax");
asm("pinsrw $7,%eax,%xmm7");
# else
# error NYI
# endif
#endif
/* We don't need to initialize dcontext->coarse_exit as it is only
* read when last_exit indicates a coarse exit, which sets the fields.
*/
dcontext->go_native = false;
}
#ifdef WINDOWS
/* On Windows we use a new dcontext for each callback context. */
dcontext_t *
create_callback_dcontext(dcontext_t *old_dcontext)
{
dcontext_t *new_dcontext = create_new_dynamo_context(false, NULL, NULL);
new_dcontext->valid = false;
/* all of these fields are shared among all dcontexts of a thread: */
new_dcontext->owning_thread = old_dcontext->owning_thread;
# ifdef UNIX
new_dcontext->owning_process = old_dcontext->owning_process;
# endif
new_dcontext->thread_record = old_dcontext->thread_record;
/* now that we have clean stack usage we can share a single stack */
ASSERT(old_dcontext->dstack != NULL);
new_dcontext->dstack = old_dcontext->dstack;
new_dcontext->isa_mode = old_dcontext->isa_mode;
new_dcontext->link_field = old_dcontext->link_field;
new_dcontext->monitor_field = old_dcontext->monitor_field;
new_dcontext->fcache_field = old_dcontext->fcache_field;
new_dcontext->fragment_field = old_dcontext->fragment_field;
new_dcontext->heap_field = old_dcontext->heap_field;
new_dcontext->vm_areas_field = old_dcontext->vm_areas_field;
new_dcontext->os_field = old_dcontext->os_field;
new_dcontext->synch_field = old_dcontext->synch_field;
/* case 8958: copy win32_start_addr in case we produce a forensics file
* from within a callback.
*/
new_dcontext->win32_start_addr = old_dcontext->win32_start_addr;
/* FlsData is persistent across callbacks */
new_dcontext->app_fls_data = old_dcontext->app_fls_data;
new_dcontext->priv_fls_data = old_dcontext->priv_fls_data;
new_dcontext->app_nt_rpc = old_dcontext->app_nt_rpc;
new_dcontext->priv_nt_rpc = old_dcontext->priv_nt_rpc;
new_dcontext->app_nls_cache = old_dcontext->app_nls_cache;
new_dcontext->priv_nls_cache = old_dcontext->priv_nls_cache;
new_dcontext->app_static_tls = old_dcontext->app_static_tls;
new_dcontext->priv_static_tls = old_dcontext->priv_static_tls;
new_dcontext->app_stack_limit = old_dcontext->app_stack_limit;
new_dcontext->app_stack_base = old_dcontext->app_stack_base;
new_dcontext->teb_base = old_dcontext->teb_base;
# ifdef UNIX
new_dcontext->signal_field = old_dcontext->signal_field;
new_dcontext->pcprofile_field = old_dcontext->pcprofile_field;
# endif
new_dcontext->private_code = old_dcontext->private_code;
new_dcontext->client_data = old_dcontext->client_data;
# ifdef DEBUG
new_dcontext->logfile = old_dcontext->logfile;
new_dcontext->thread_stats = old_dcontext->thread_stats;
# endif
# ifdef DEADLOCK_AVOIDANCE
new_dcontext->thread_owned_locks = old_dcontext->thread_owned_locks;
# endif
# ifdef KSTATS
new_dcontext->thread_kstats = old_dcontext->thread_kstats;
# endif
/* at_syscall is real time based, not app context based, so it is shared.
*
* FIXME: Yes, we need to share when swapping at NtCallbackReturn, but we
* want to keep the old value so that on return from the cb we will do
* post-syscall handling for the syscall that triggered the cb in the
* first place! Plus, a new cb calls initialize_dynamo_context(), which
* clears this field anyway! This all works now b/c we don't have
* alertable syscalls that we do post-syscall processing on.
*/
new_dcontext->upcontext_ptr->at_syscall = old_dcontext->upcontext_ptr->at_syscall;
# ifdef HOT_PATCHING_INTERFACE /* Fix for case 5367. */
/* hotp_excpt_state should be unused at this point. If it is used, it can
* only be because a hot patch made a system call with a callback. This is
* a bug because hot patches can't do system calls, let alone ones with
* callbacks.
*/
DOCHECK(1, {
dr_jmp_buf_t empty;
memset(&empty, -1, sizeof(dr_jmp_buf_t));
ASSERT(memcmp(&old_dcontext->hotp_excpt_state, &empty, sizeof(dr_jmp_buf_t)) ==
0);
});
new_dcontext->nudge_thread = old_dcontext->nudge_thread;
# endif
/* our exceptions should be handled within one DR context switch */
ASSERT(old_dcontext->try_except.try_except_state == NULL);
new_dcontext->local_state = old_dcontext->local_state;
# ifdef WINDOWS
new_dcontext->aslr_context.last_child_padded =
old_dcontext->aslr_context.last_child_padded;
# endif
LOG(new_dcontext->logfile, LOG_TOP, 2, "made new dcontext " PFX " (old=" PFX ")\n",
new_dcontext, old_dcontext);
return new_dcontext;
}
#endif
bool
is_thread_initialized(void)
{
#if defined(UNIX) && defined(HAVE_TLS)
/* We don't want to pay the d_r_get_thread_id() cost on every
* get_thread_private_dcontext() when we only really need the
* check for this call here, so we explicitly check.
*/
if (get_tls_thread_id() != get_sys_thread_id())
return false;
#endif
return (get_thread_private_dcontext() != NULL);
}
bool
is_thread_known(thread_id_t tid)
{
return (thread_lookup(tid) != NULL);
}
#ifdef UNIX
/* i#237/PR 498284: a thread about to execute SYS_execve should be considered
* exited, but we can't easily clean it up for real immediately
*/
void
mark_thread_execve(thread_record_t *tr, bool execve)
{
ASSERT((execve && !tr->execve) || (!execve && tr->execve));
tr->execve = execve;
d_r_mutex_lock(&all_threads_lock);
if (execve) {
/* since we free on a second vfork we should never accumulate
* more than one
*/
ASSERT(num_execve_threads == 0);
num_execve_threads++;
} else {
ASSERT(num_execve_threads > 0);
num_execve_threads--;
}
d_r_mutex_unlock(&all_threads_lock);
}
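/* Example timeline (a sketch of the intended i#237 usage): a vfork'd child
 * calls mark_thread_execve(tr, true) just before SYS_execve, so thread
 * counts exclude it; if the execve fails and the thread keeps running under
 * DR, it calls mark_thread_execve(tr, false) to be counted again, matching
 * the toggle ASSERT above.
 */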
#endif /* UNIX */
int
d_r_get_num_threads(void)
{
return num_known_threads IF_UNIX(-num_execve_threads);
}
bool
is_last_app_thread(void)
{
return (d_r_get_num_threads() == get_num_client_threads() + 1);
}
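/* E.g., with 3 known threads of which 2 are client threads,
 * d_r_get_num_threads() == 3 == get_num_client_threads() + 1, so the
 * remaining thread is the last app thread.
 */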
/* This routine takes a snapshot of all the threads known to DR,
* NOT LIMITED to those currently under DR control!
* It returns an array of thread_record_t* and the length of the array.
* The caller must free the array using global_heap_free.
* The caller must hold the thread_initexit_lock to ensure that threads
* are not created or destroyed before the caller is done with the list.
* The caller CANNOT be could_be_linking, else a deadlock with flushing
* can occur (unless the caller is the one flushing).
*/
static void
get_list_of_threads_common(thread_record_t ***list,
int *num _IF_UNIX(bool include_execve))
{
int i, cur = 0, max_num;
thread_record_t *tr;
thread_record_t **mylist;
/* Only a flushing thread can get the thread snapshot while being
* couldbelinking -- else a deadlock w/ flush!
* FIXME: this assert should be on any acquisition of thread_initexit_lock!
*/
ASSERT(is_self_flushing() || !is_self_couldbelinking());
ASSERT(all_threads != NULL);
ASSERT_OWN_MUTEX(true, &thread_initexit_lock);
d_r_mutex_lock(&all_threads_lock);
/* Do not include vfork threads that exited via execve, unless we're exiting */
max_num = IF_UNIX_ELSE((include_execve || dynamo_exiting) ? num_known_threads
: d_r_get_num_threads(),
d_r_get_num_threads());
mylist = (thread_record_t **)global_heap_alloc(
max_num * sizeof(thread_record_t *) HEAPACCT(ACCT_THREAD_MGT));
for (i = 0; i < HASHTABLE_SIZE(ALL_THREADS_HASH_BITS); i++) {
for (tr = all_threads[i]; tr != NULL; tr = tr->next) {
/* include those for which !tr->under_dynamo_control */
/* don't include those that exited for execve. there should be
* no race b/c vfork suspends the parent. xref i#237/PR 498284.
*/
if (IF_UNIX_ELSE(!tr->execve || include_execve || dynamo_exiting, true)) {
mylist[cur] = tr;
cur++;
}
}
}
ASSERT(cur > 0);
IF_WINDOWS(ASSERT(cur == max_num));
if (cur < max_num) {
mylist = (thread_record_t **)global_heap_realloc(
mylist, max_num, cur, sizeof(thread_record_t *) HEAPACCT(ACCT_THREAD_MGT));
}
*num = cur;
*list = mylist;
d_r_mutex_unlock(&all_threads_lock);
}
void
get_list_of_threads(thread_record_t ***list, int *num)
{
get_list_of_threads_common(list, num _IF_UNIX(false));
}
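/* Example (a hypothetical caller sketch; process() is a stand-in for
 * whatever per-thread work the caller does):
 *
 *   thread_record_t **threads;
 *   int num, i;
 *   d_r_mutex_lock(&thread_initexit_lock);
 *   get_list_of_threads(&threads, &num);
 *   for (i = 0; i < num; i++)
 *       process(threads[i]);
 *   global_heap_free(threads,
 *                    num * sizeof(thread_record_t *) HEAPACCT(ACCT_THREAD_MGT));
 *   d_r_mutex_unlock(&thread_initexit_lock);
 */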
#ifdef UNIX
void
get_list_of_threads_ex(thread_record_t ***list, int *num, bool include_execve)
{
get_list_of_threads_common(list, num, include_execve);
}
#endif
/* assumes caller can ensure that thread is either suspended or self to
* avoid races
*/
thread_record_t *
thread_lookup(thread_id_t tid)
{
thread_record_t *tr;
uint hindex;
/* check that caller is self or has initexit_lock
* FIXME: no way to tell who has initexit_lock
*/
ASSERT(mutex_testlock(&thread_initexit_lock) || tid == d_r_get_thread_id());
hindex = HASH_FUNC_BITS(tid, ALL_THREADS_HASH_BITS);
d_r_mutex_lock(&all_threads_lock);
if (all_threads == NULL) {
tr = NULL;
} else {
tr = all_threads[hindex];
}
while (tr != NULL) {
if (tr->id == tid) {
d_r_mutex_unlock(&all_threads_lock);
return tr;
}
tr = tr->next;
}
d_r_mutex_unlock(&all_threads_lock);
return NULL;
}
/* assumes caller can ensure that thread is either suspended or self to
* avoid races
*/
uint
get_thread_num(thread_id_t tid)
{
thread_record_t *tr = thread_lookup(tid);
if (tr != NULL)
return tr->num;
else
return 0; /* yes can't distinguish from 1st thread, who cares */
}
void
add_thread(IF_WINDOWS_ELSE_NP(HANDLE hthread, process_id_t pid), thread_id_t tid,
bool under_dynamo_control, dcontext_t *dcontext)
{
thread_record_t *tr;
uint hindex;
ASSERT(all_threads != NULL);
/* add entry to thread hashtable */
tr = (thread_record_t *)global_heap_alloc(sizeof(thread_record_t)
HEAPACCT(ACCT_THREAD_MGT));
#ifdef WINDOWS
/* We duplicate the thread pseudo-handle; this should give us full rights.
* Note that explicitly asking for THREAD_ALL_ACCESS or just for
* THREAD_TERMINATE|THREAD_SUSPEND_RESUME|THREAD_GET_CONTEXT|THREAD_SET_CONTEXT
* instead does not seem to acquire any more rights than simply duplicating
* the app handle does.
*/
LOG(GLOBAL, LOG_THREADS, 1, "Thread %d app handle rights: " PFX "\n", tid,
nt_get_handle_access_rights(hthread));
duplicate_handle(NT_CURRENT_PROCESS, hthread, NT_CURRENT_PROCESS, &tr->handle, 0, 0,
DUPLICATE_SAME_ACCESS | DUPLICATE_SAME_ATTRIBUTES);
/* We prob. only need TERMINATE (for kill thread), SUSPEND/RESUME/GET_CONTEXT
* (for synchronizing), and SET_CONTEXT (+ synchronizing requirements, for
* detach). All access includes this and quite a bit more. */
# if 0
/* eventually should be a real assert, but until we have a story for the
* injected detach threads, have to ifdef out even the ASSERT_CURIOSITY
* (even a syslog internal warning is prob. too noisy for QA) */
ASSERT_CURIOSITY(TESTALL(THREAD_ALL_ACCESS, nt_get_handle_access_rights(tr->handle)));
# endif
LOG(GLOBAL, LOG_THREADS, 1, "Thread %d our handle rights: " PFX "\n", tid,
nt_get_handle_access_rights(tr->handle));
tr->retakeover = false;
#else
tr->pid = pid;
tr->execve = false;
#endif
tr->id = tid;
ASSERT(tid != INVALID_THREAD_ID); /* ensure os never assigns invalid id to a thread */
tr->under_dynamo_control = under_dynamo_control;
tr->dcontext = dcontext;
if (dcontext != NULL) /* we allow NULL for dr_create_client_thread() */
dcontext->thread_record = tr;
d_r_mutex_lock(&all_threads_lock);
tr->num = threads_ever_count++;
hindex = HASH_FUNC_BITS(tr->id, ALL_THREADS_HASH_BITS);
tr->next = all_threads[hindex];
all_threads[hindex] = tr;
/* must be inside all_threads_lock to avoid race w/ get_list_of_threads */
RSTATS_ADD_PEAK(num_threads, 1);
RSTATS_INC(num_threads_created);
num_known_threads++;
d_r_mutex_unlock(&all_threads_lock);
}
/* return false if couldn't find the thread */
bool
remove_thread(IF_WINDOWS_(HANDLE hthread) thread_id_t tid)
{
thread_record_t *tr = NULL, *prevtr;
uint hindex = HASH_FUNC_BITS(tid, ALL_THREADS_HASH_BITS);
ASSERT(all_threads != NULL);
d_r_mutex_lock(&all_threads_lock);
for (tr = all_threads[hindex], prevtr = NULL; tr; prevtr = tr, tr = tr->next) {
if (tr->id == tid) {
if (prevtr)
prevtr->next = tr->next;
else
all_threads[hindex] = tr->next;
/* must be inside all_threads_lock to avoid race w/ get_list_of_threads */
RSTATS_DEC(num_threads);
#ifdef UNIX
if (tr->execve) {
ASSERT(num_execve_threads > 0);
num_execve_threads--;
}
#endif
num_known_threads--;
#ifdef WINDOWS
close_handle(tr->handle);
#endif
global_heap_free(tr, sizeof(thread_record_t) HEAPACCT(ACCT_THREAD_MGT));
break;
}
}
d_r_mutex_unlock(&all_threads_lock);
return (tr != NULL);
}
/* this bool is protected by reset_pending_lock */
DECLARE_FREQPROT_VAR(static bool reset_at_nth_thread_triggered, false);
/* thread-specific initialization
* if dstack_in is NULL, then a dstack is allocated; else dstack_in is used
* as the thread's dstack.
* mc can be NULL for the initial thread.
* returns -1 if the current thread has already been initialized
*/
/* On UNIX, if dstack_in != NULL, the parent of this new thread must have
* increased uninit_thread_count.
*/
int
dynamo_thread_init(byte *dstack_in, priv_mcontext_t *mc, void *os_data,
bool client_thread)
{
dcontext_t *dcontext;
/* due to lock issues (see below) we need another var */
bool reset_at_nth_thread_pending = false;
bool under_dynamo_control = true;
APP_EXPORT_ASSERT(dynamo_initialized || dynamo_exited || d_r_get_num_threads() == 0 ||
client_thread,
PRODUCT_NAME " not initialized");
if (INTERNAL_OPTION(nullcalls)) {
ASSERT(uninit_thread_count == 0);
return SUCCESS;
}
/* note that ENTERING_DR is assumed to have already happened: in apc handler
* for win32, in new_thread_setup for linux, in main init for 1st thread
*/
#if defined(WINDOWS) && defined(DR_APP_EXPORTS)
/* We need to identify a thread we intercepted in its APC when we
* take over all threads on dr_app_start(). Stack and pc checks aren't
* simple b/c it can be in ntdll waiting on a lock.
*/
if (dr_api_entry)
os_take_over_mark_thread(d_r_get_thread_id());
#endif
/* Try to handle externally injected threads */
if (dynamo_initialized && !bb_lock_start)
pre_second_thread();
/* synch point so thread creation can be prevented for critical periods */
d_r_mutex_lock(&thread_initexit_lock);
/* XXX i#2611: during detach, there is a race where a thread can
* reach here on Windows despite init_apc_go_native (i#2600).
*/
ASSERT_BUG_NUM(2611, !doing_detach);
/* The assumption is that if dynamo_exited, then we are about to exit and
* clean up; initializing this thread now would be dangerous, so it is
* better to wait here for the app to die.
*/
/* under current implementation of process exit, can happen only under
* debug build, or app_start app_exit interface */
while (dynamo_exited) {
/* logging should be safe, though might not actually result in log
* message */
DODEBUG_ONCE(LOG(GLOBAL, LOG_THREADS, 1,
"Thread %d reached initialization point while dynamo exiting, "
"waiting for app to exit\n",
d_r_get_thread_id()););
d_r_mutex_unlock(&thread_initexit_lock);
os_thread_yield();
/* just in case we want to support exited and then restarted at some
* point */
d_r_mutex_lock(&thread_initexit_lock);
}
if (is_thread_initialized()) {
d_r_mutex_unlock(&thread_initexit_lock);
#if defined(WINDOWS) && defined(DR_APP_EXPORTS)
if (dr_api_entry)
os_take_over_unmark_thread(d_r_get_thread_id());
#endif
return -1;
}
os_tls_init();
dcontext = create_new_dynamo_context(true /*initial*/, dstack_in, mc);
initialize_dynamo_context(dcontext);
set_thread_private_dcontext(dcontext);
/* sanity check */
ASSERT(get_thread_private_dcontext() == dcontext);
/* set local state pointer for access from other threads */
dcontext->local_state = get_local_state();
/* set initial mcontext, if known */
if (mc != NULL)
*get_mcontext(dcontext) = *mc;
/* For hotp_only, the thread should run native, not under dr. However,
* the core should still get control of the thread at hook points to track
* what the application is doing & at patched points to execute hot patches.
* It is the same for thin_client except that there are fewer hooks, only to
* follow children.
*/
if (RUNNING_WITHOUT_CODE_CACHE())
under_dynamo_control = false;
/* Add the entry to the thread hashtable before creating the logdir so we have
* a thread num. Otherwise we'd prefer to do this only after we'd fully
* initialized the thread, but we hold the thread_initexit_lock, so nobody
* should be listing us -- thread_lookup on other than self, or a thread list,
* should only be done while the initexit_lock is held. CHECK: is this always
* correct? thread_lookup does have an assert to try to enforce this but
* cannot tell who has the lock.
*/
add_thread(IF_WINDOWS_ELSE(NT_CURRENT_THREAD, get_process_id()), d_r_get_thread_id(),
under_dynamo_control, dcontext);
#ifdef UNIX /* i#2600: Not easy on Windows: we rely on init_apc_go_native there. */
if (dstack_in != NULL) { /* Else not a thread creation we observed */
ASSERT(uninit_thread_count > 0);
ATOMIC_DEC(int, uninit_thread_count);
}
#endif
#if defined(WINDOWS) && defined(DR_APP_EXPORTS)
/* Now that the thread is in the main thread table we don't need to remember it */
if (dr_api_entry)
os_take_over_unmark_thread(d_r_get_thread_id());
#endif
LOG(GLOBAL, LOG_TOP | LOG_THREADS, 1,
"\ndynamo_thread_init: %d thread(s) now, dcontext=" PFX ", #=%d, id=" TIDFMT
", pid=" PIDFMT "\n\n",
GLOBAL_STAT(num_threads), dcontext, get_thread_num(d_r_get_thread_id()),
d_r_get_thread_id(), get_process_id());
DOLOG(1, LOG_STATS, { dump_global_stats(false); });
#ifdef DEBUG
if (d_r_stats->loglevel > 0) {
dcontext->logfile = open_log_file(thread_logfile_name(), NULL, 0);
print_file(dcontext->logfile, "%s\n", dynamorio_version_string);
} else {
dcontext->logfile = INVALID_FILE;
}
DOLOG(1, LOG_TOP | LOG_THREADS, {
LOG(THREAD, LOG_TOP | LOG_THREADS, 1, PRODUCT_NAME " built with: %s\n",
DYNAMORIO_DEFINES);
LOG(THREAD, LOG_TOP | LOG_THREADS, 1, PRODUCT_NAME " built on: %s\n",
dynamorio_buildmark);
});
LOG(THREAD, LOG_TOP | LOG_THREADS, 1, "%sTHREAD %d (dcontext " PFX ")\n\n",
client_thread ? "CLIENT " : "", d_r_get_thread_id(), dcontext);
LOG(THREAD, LOG_TOP | LOG_THREADS, 1,
"DR stack is " PFX "-" PFX " (passed in " PFX ")\n",
dcontext->dstack - DYNAMORIO_STACK_SIZE, dcontext->dstack, dstack_in);
#endif
#ifdef DEADLOCK_AVOIDANCE
locks_thread_init(dcontext);
#endif
heap_thread_init(dcontext);
DOSTATS({ stats_thread_init(dcontext); });
#ifdef KSTATS
kstat_thread_init(dcontext);
#endif
os_thread_init(dcontext, os_data);
arch_thread_init(dcontext);
synch_thread_init(dcontext);
if (!DYNAMO_OPTION(thin_client))
vm_areas_thread_init(dcontext);
monitor_thread_init(dcontext);
fcache_thread_init(dcontext);
link_thread_init(dcontext);
fragment_thread_init(dcontext);
/* OS thread init after synch_thread_init and other setup can handle signals, etc. */
os_thread_init_finalize(dcontext, os_data);
/* This lock has served its purposes: A) a barrier to thread creation for those
* iterating over threads, B) mutex for add_thread, and C) mutex for synch_field
* to be set up.
* So we release it to shrink the time spent w/ this big lock, in particular
* to avoid holding it while running private lib thread init code (i#875).
*/
d_r_mutex_unlock(&thread_initexit_lock);
/* Set up client data needed in loader_thread_init for IS_CLIENT_THREAD */
instrument_client_thread_init(dcontext, client_thread);
loader_thread_init(dcontext);
if (!DYNAMO_OPTION(thin_client)) {
/* put client last, may depend on other thread inits.
* Note that we are calling this prior to instrument_init()
* now (PR 216936), which is required to initialize
* the client dcontext field prior to instrument_init().
*/
instrument_thread_init(dcontext, client_thread, mc != NULL);
#ifdef SIDELINE
if (dynamo_options.sideline) {
/* wake up sideline thread -- ok to call if thread already awake */
sideline_start();
}
#endif
}
/* must check # threads while holding thread_initexit_lock, yet cannot
* call fcache_reset_all_caches_proactively while holding it due to
* rank order of reset_pending_lock which we must also hold -- so we
* set a local bool reset_at_nth_thread_pending
*/
if (DYNAMO_OPTION(reset_at_nth_thread) != 0 && !reset_at_nth_thread_triggered &&
(uint)d_r_get_num_threads() == DYNAMO_OPTION(reset_at_nth_thread)) {
d_r_mutex_lock(&reset_pending_lock);
if (!reset_at_nth_thread_triggered) {
reset_at_nth_thread_triggered = true;
reset_at_nth_thread_pending = true;
}
d_r_mutex_unlock(&reset_pending_lock);
}
DOLOG(1, LOG_STATS, { dump_thread_stats(dcontext, false); });
if (reset_at_nth_thread_pending) {
d_r_mutex_lock(&reset_pending_lock);
/* fcache_reset_all_caches_proactively() will unlock */
fcache_reset_all_caches_proactively(RESET_ALL);
}
return SUCCESS;
}
/* We don't free the cur thread until after the client exit event (PR 536058),
* except for fragment_thread_exit(). Since this is called outside of
* dynamo_thread_exit() on process exit, we assume it is fine to skip
* enter_threadexit().
*/
void
dynamo_thread_exit_pre_client(dcontext_t *dcontext, thread_id_t id)
{
/* fcache stats needs to examine fragment state, so run it before
* fragment exit, but real fcache exit needs to be after fragment exit
*/
#ifdef DEBUG
fcache_thread_exit_stats(dcontext);
#endif
/* must abort now to avoid deleting possibly un-deletable fragments
* monitor_thread_exit remains later b/c of monitor_remove_fragment calls
*/
trace_abort_and_delete(dcontext);
fragment_thread_exit(dcontext);
IF_WINDOWS(loader_pre_client_thread_exit(dcontext));
instrument_thread_exit_event(dcontext);
}
/* thread-specific cleanup */
/* Note: if this routine is not called by the thread identified by id (i.e.,
* cleanup is being done on its behalf by another thread), then other_thread
* should be true and the calling thread must hold the thread_initexit_lock.
*/
static int
dynamo_thread_exit_common(dcontext_t *dcontext, thread_id_t id,
IF_WINDOWS_(bool detach_stacked_callbacks) bool other_thread)
{
dcontext_t *dcontext_tmp;
#ifdef WINDOWS
dcontext_t *dcontext_next;
int num_dcontext;
#endif
if (INTERNAL_OPTION(nullcalls) || dcontext == NULL)
return SUCCESS;
/* do not dereference dcontext until after the NULL check above */
bool on_dstack = !other_thread && is_currently_on_dstack(dcontext);
/* cache this now for use after freeing dcontext */
local_state_t *local_state = dcontext->local_state;
/* make sure don't get into deadlock w/ flusher */
enter_threadexit(dcontext);
/* synch point so thread exiting can be prevented for critical periods */
/* see comment at start of method for other thread exit */
if (!other_thread)
d_r_mutex_lock(&thread_initexit_lock);
ASSERT_OWN_MUTEX(true, &thread_initexit_lock);
#ifdef WINDOWS
/* need to clean up thread stack before clean up other thread data, but
* after we're made nolinking
*/
os_thread_stack_exit(dcontext);
/* free the thread's application stack if requested */
if (dcontext->free_app_stack) {
byte *base;
/* only used for nudge threads currently */
ASSERT(dcontext->nudge_target == generic_nudge_target);
if (get_stack_bounds(dcontext, &base, NULL)) {
NTSTATUS res;
ASSERT(base != NULL);
res = nt_free_virtual_memory(base);
ASSERT(NT_SUCCESS(res));
} else {
/* stack should be available here */
ASSERT_NOT_REACHED();
}
}
#endif
#ifdef SIDELINE
/* N.B.: do not clean up any data structures while sideline thread
* is still running! put it to sleep for duration of this routine!
*/
if (!DYNAMO_OPTION(thin_client)) {
if (dynamo_options.sideline) {
/* put sideline thread to sleep */
sideline_stop();
/* sideline_stop will not return until sideline thread is asleep */
}
}
#endif
LOG(GLOBAL, LOG_TOP | LOG_THREADS, 1,
"\ndynamo_thread_exit (thread #%d id=" TIDFMT "): %d thread(s) now\n\n",
get_thread_num(id), id, GLOBAL_STAT(num_threads) - 1);
DOLOG(1, LOG_STATS, { dump_global_stats(false); });
LOG(THREAD, LOG_STATS | LOG_THREADS, 1, "\n## Statistics for this thread:\n");
#ifdef PROFILE_RDTSC
if (dynamo_options.profile_times) {
int i;
ASSERT(dcontext);
LOG(THREAD, LOG_STATS | LOG_THREADS, 1, "\nTop ten cache times:\n");
for (i = 0; i < 10; i++) {
if (dcontext->cache_time[i] > (uint64)0) {
uint top_part, bottom_part;
divide_int64_print(dcontext->cache_time[i], kilo_hertz, false, 3,
&top_part, &bottom_part);
LOG(THREAD, LOG_STATS | LOG_THREADS, 1,
"\t#%2d = %6u.%.3u ms, %9d hits\n", i + 1, top_part, bottom_part,
(int)dcontext->cache_count[i]);
}
}
LOG(THREAD, LOG_STATS | LOG_THREADS, 1, "\n");
}
#endif
/* In order to pass the client a dcontext in the process exit event
* we do some thread cleanup early for the final thread so we can delay
* the rest (PR 536058)
*/
if (!dynamo_exited_and_cleaned)
dynamo_thread_exit_pre_client(dcontext, id);
/* PR 243759: don't free client_data until after all fragment deletion events */
if (!DYNAMO_OPTION(thin_client))
instrument_thread_exit(dcontext);
/* i#920: we can't take segment/timer/asynch actions for other threads.
* This must be called after dynamo_thread_exit_pre_client where
* we called event callbacks.
*/
if (!other_thread) {
dynamo_thread_not_under_dynamo(dcontext);
#ifdef WINDOWS
/* We don't do this inside os_thread_not_under_dynamo b/c we do it in
* context switches. os_loader_exit() will call this, but it has no
* dcontext, so it won't swap internal TEB fields.
*/
swap_peb_pointer(dcontext, false /*to app*/);
#endif
}
/* We clean up priv libs prior to setting tls dc to NULL so we can use
* TRY_EXCEPT when calling the priv lib entry routine
*/
if (!dynamo_exited ||
(other_thread &&
(IF_WINDOWS_ELSE(!doing_detach, true) ||
dcontext->owning_thread != d_r_get_thread_id()))) /* else already did this */
loader_thread_exit(dcontext);
/* set tls dc to NULL prior to cleanup, to avoid problems handling
* alarm signals received during cleanup (we'll suppress if tls
* dc==NULL which seems the right thing to do: not worth our
* effort to pass to another thread if thread-group-shared alarm,
* and if thread-private then thread would have exited soon
* anyway). see PR 596127.
*/
/* make sure we invalidate the dcontext before releasing the memory */
/* when cleaning up other threads, we cannot set their dcs to null,
* but we only do this at dynamorio_app_exit so who cares
*/
/* This must be called after instrument_thread_exit, which uses
* get_thread_private_dcontext for app/dr state checks.
*/
if (id == d_r_get_thread_id())
set_thread_private_dcontext(NULL);
fcache_thread_exit(dcontext);
link_thread_exit(dcontext);
monitor_thread_exit(dcontext);
if (!DYNAMO_OPTION(thin_client))
vm_areas_thread_exit(dcontext);
synch_thread_exit(dcontext);
arch_thread_exit(dcontext _IF_WINDOWS(detach_stacked_callbacks));
os_thread_exit(dcontext, other_thread);
DOLOG(1, LOG_STATS, { dump_thread_stats(dcontext, false); });
#ifdef KSTATS
kstat_thread_exit(dcontext);
#endif
DOSTATS({ stats_thread_exit(dcontext); });
heap_thread_exit(dcontext);
#ifdef DEADLOCK_AVOIDANCE
locks_thread_exit(dcontext);
#endif
#ifdef DEBUG
if (dcontext->logfile != INVALID_FILE && dcontext->logfile != STDERR) {
os_flush(dcontext->logfile);
close_log_file(dcontext->logfile);
}
#endif
/* remove thread from threads hashtable */
remove_thread(IF_WINDOWS_(NT_CURRENT_THREAD) id);
dcontext_tmp = dcontext;
#ifdef WINDOWS
/* clean up all the dcs */
num_dcontext = 0;
/* Already at one end of list. Delete through to other end. */
while (dcontext_tmp) {
num_dcontext++;
dcontext_next = dcontext_tmp->prev_unused;
delete_dynamo_context(dcontext_tmp,
dcontext_tmp == dcontext /*do not free dup cb stacks*/
&& !on_dstack /*do not free own stack*/);
dcontext_tmp = dcontext_next;
}
LOG(GLOBAL, LOG_STATS | LOG_THREADS, 1, "\tdynamo contexts used: %d\n", num_dcontext);
#else /* UNIX */
delete_dynamo_context(dcontext_tmp, !on_dstack /*do not free own stack*/);
#endif /* UNIX */
os_tls_exit(local_state, other_thread);
#ifdef SIDELINE
/* see notes above -- we can now wake up sideline thread */
if (dynamo_options.sideline && d_r_get_num_threads() > 0) {
sideline_start();
}
#endif
if (!other_thread) {
d_r_mutex_unlock(&thread_initexit_lock);
/* FIXME: once thread_initexit_lock is released, we're not on
* thread list, and a terminate targeting us could kill us in the middle
* of this call -- but this can't come before the unlock b/c the lock's
* in the data segment! (see case 3121)
* (note we do not re-protect for process exit, see !dynamo_exited check
* in exiting_dynamorio)
*/
if (!on_dstack) {
EXITING_DR();
/* else, caller will clean up stack and then call EXITING_DR(),
* probably via dynamo_thread_stack_free_and_exit(), as the stack free
* must be done before the exit
*/
}
}
return SUCCESS;
}
/* NOINLINE because dynamo_thread_exit is a stopping point. */
NOINLINE int
dynamo_thread_exit(void)
{
dcontext_t *dcontext = get_thread_private_dcontext();
return dynamo_thread_exit_common(dcontext, d_r_get_thread_id(),
IF_WINDOWS_(false) false);
}
/* NOTE : you must hold thread_initexit_lock to call this function! */
int
dynamo_other_thread_exit(thread_record_t *tr _IF_WINDOWS(bool detach_stacked_callbacks))
{
/* FIXME: Usually a safe spot for cleaning other threads should be
* under num_exits_dir_syscall, but for now rewinding all the way
*/
KSTOP_REWIND_DC(tr->dcontext, thread_measured);
KSTART_DC(tr->dcontext, thread_measured);
return dynamo_thread_exit_common(tr->dcontext, tr->id,
IF_WINDOWS_(detach_stacked_callbacks) true);
}
/* Called from another stack to finish cleaning up a thread.
* The final steps are to free the stack and perform the exit hook.
*/
void
dynamo_thread_stack_free_and_exit(byte *stack)
{
if (stack != NULL) {
stack_free(stack, DYNAMORIO_STACK_SIZE);
/* ASSUMPTION: if stack is NULL here, the exit was done earlier
* (fixes case 6967)
*/
EXITING_DR();
}
}
#ifdef DR_APP_EXPORTS
/* API routine to initialize DR */
DR_APP_API int
dr_app_setup(void)
{
/* FIXME: we either have to disallow the client calling this with
* more than one thread running, or we have to suspend all the threads.
* We should share the suspend-and-takeover loop (and for dr_app_setup_and_start
* share the takeover portion) from dr_app_start().
*/
int res;
dcontext_t *dcontext;
/* If this is a re-attach, .data might be read-only.
* We'll re-protect at the end of dynamorio_app_init().
*/
if (DATASEC_WRITABLE(DATASEC_RARELY_PROT) == 0)
SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT);
dr_api_entry = true;
dynamo_control_via_attach = true;
res = dynamorio_app_init();
/* For dr_api_entry, we do not install all our signal handlers during init (to avoid
* races: i#2335): we delay until dr_app_start(). Plus the vsyscall hook is
* not set up until we find out the syscall method. Thus we're already
* "os_process_not_under_dynamorio".
* We can't as easily avoid initializing the thread TLS and then dropping
* it, however, as parts of init assume we have TLS.
*/
dcontext = get_thread_private_dcontext();
dynamo_thread_not_under_dynamo(dcontext);
return res;
}
/* API routine to exit DR */
DR_APP_API int
dr_app_cleanup(void)
{
thread_record_t *tr;
SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT);
dr_api_exit = true;
SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT); /* to keep properly nested */
/* XXX: The dynamo_thread_[not_]under_dynamo() routines are not idempotent,
* and must be balanced! On Linux, they track the shared itimer refcount,
* so a mismatch will lead to a refleak or negative refcount.
* dynamorio_app_exit() will call dynamo_thread_not_under_dynamo(), so we
* must ensure that we are under DR before calling it. Therefore, we
* require that the caller call dr_app_stop() before calling
* dr_app_cleanup(). However, we cannot make a usage assertion to that
* effect without addressing the FIXME comments in
* dynamo_thread_not_under_dynamo() about updating tr->under_dynamo_control.
*/
tr = thread_lookup(d_r_get_thread_id());
if (tr != NULL && tr->dcontext != NULL) {
os_process_under_dynamorio_initiate(tr->dcontext);
os_process_under_dynamorio_complete(tr->dcontext);
dynamo_thread_under_dynamo(tr->dcontext);
}
return dynamorio_app_exit();
}
/* Called by dr_app_start in arch-specific assembly file */
void
dr_app_start_helper(priv_mcontext_t *mc)
{
apicheck(dynamo_initialized, PRODUCT_NAME " not initialized");
LOG(GLOBAL, LOG_TOP, 1, "dr_app_start in thread " TIDFMT "\n", d_r_get_thread_id());
LOG(THREAD_GET, LOG_TOP, 1, "dr_app_start\n");
if (!INTERNAL_OPTION(nullcalls)) {
/* Adjust the app stack to account for the return address + alignment.
* See dr_app_start in x86.asm.
*/
mc->xsp += DYNAMO_START_XSP_ADJUST;
dynamo_start(mc);
/* the interpreter takes over from here */
}
}
/* Dummy routine that returns control to the app if it is currently
* under dynamo control.
* NOINLINE because dr_app_stop is a stopping point.
*/
DR_APP_API NOINLINE void
dr_app_stop(void)
{
/* the application regains control in here */
}
/* NOINLINE because dr_app_stop_and_cleanup is a stopping point. */
DR_APP_API NOINLINE void
dr_app_stop_and_cleanup(void)
{
dr_app_stop_and_cleanup_with_stats(NULL);
}
/* NOINLINE because dr_app_stop_and_cleanup_with_stats is a stopping point. */
DR_APP_API NOINLINE void
dr_app_stop_and_cleanup_with_stats(dr_stats_t *drstats)
{
/* XXX i#95: today this is a full detach, while a separated dr_app_cleanup()
* is not. We should try and have dr_app_cleanup() take this detach path
* here (and then we can simplify exit_synch_state()) but it's more complicated
* and we need to resolve the unbounded dr_app_stop() time.
*/
if (dynamo_initialized && !dynamo_exited && !doing_detach) {
# ifdef WINDOWS
/* dynamo_thread_exit_common will later swap to app. */
swap_peb_pointer(get_thread_private_dcontext(), true /*to priv*/);
# endif
detach_on_permanent_stack(true /*internal*/, true /*do cleanup*/, drstats);
}
/* the application regains control in here */
}
DR_APP_API int
dr_app_setup_and_start(void)
{
int r = dr_app_setup();
if (r == SUCCESS)
dr_app_start();
return r;
}
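/* Example (a hypothetical app-side sketch, assuming the dr_app_* declarations
 * from dr_app.h; run_workload() is a stand-in):
 *
 *   if (dr_app_setup_and_start() != SUCCESS)
 *       return 1;                  // failed to initialize or take over
 *   run_workload();                // hypothetical work, now under DR control
 *   dr_app_stop_and_cleanup();     // detach and free all DR state
 */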
#endif
/* For use by threads that start and stop being under dynamo control.
*/
void
dynamo_thread_under_dynamo(dcontext_t *dcontext)
{
ASSERT(dcontext != NULL);
LOG(THREAD, LOG_ASYNCH, 2, "thread %d under DR control\n", dcontext->owning_thread);
/* FIXME: mark under_dynamo_control?
* see comments in not routine below
*/
os_thread_under_dynamo(dcontext);
#ifdef SIDELINE
if (dynamo_options.sideline) {
/* wake up sideline thread -- ok to call if thread already awake */
sideline_start();
}
#endif
dcontext->currently_stopped = false;
dcontext->go_native = false;
}
/* For use by threads that start and stop being under dynamo control.
* This must be called by the owner of dcontext itself and not by another,
* non-executing thread.
*/
void
dynamo_thread_not_under_dynamo(dcontext_t *dcontext)
{
ASSERT_MESSAGE(CHKLVL_ASSERTS + 1 /*expensive*/, "can only act on executing thread",
dcontext == get_thread_private_dcontext());
if (dcontext == NULL)
return;
LOG(THREAD, LOG_ASYNCH, 2, "thread %d not under DR control\n",
dcontext->owning_thread);
dcontext->currently_stopped = true;
os_thread_not_under_dynamo(dcontext);
#ifdef SIDELINE
/* FIXME: if # active threads is 0, then put sideline thread to sleep! */
if (dynamo_options.sideline) {
/* put sideline thread to sleep */
sideline_stop();
}
#endif
#ifdef DEBUG
os_flush(dcontext->logfile);
#endif
}
/* Mark this thread as under DR, and take over other threads in the current process.
*/
void
dynamorio_take_over_threads(dcontext_t *dcontext)
{
/* We repeatedly check if there are other threads in the process, since
* while we're checking one may be spawning additional threads.
*/
bool found_threads;
uint attempts = 0;
uint max_takeover_attempts = DYNAMO_OPTION(takeover_attempts);
os_process_under_dynamorio_initiate(dcontext);
/* We can start this thread now that we've set up process-wide actions such
* as handling signals.
*/
dynamo_thread_under_dynamo(dcontext);
signal_event(dr_app_started);
SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT);
dynamo_started = true;
/* Similarly, with our signal handler back in place, we remove the TLS limit. */
detacher_tid = INVALID_THREAD_ID;
SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT);
/* XXX i#1305: we should suspend all the other threads for DR init to
* satisfy the parts of the init process that assume there are no races.
*/
do {
found_threads = os_take_over_all_unknown_threads(dcontext);
attempts++;
if (found_threads && !bb_lock_start)
bb_lock_start = true;
if (DYNAMO_OPTION(sleep_between_takeovers))
os_thread_sleep(1);
} while (found_threads && attempts < max_takeover_attempts);
os_process_under_dynamorio_complete(dcontext);
instrument_post_attach_event();
/* End the barrier to new threads. */
signal_event(dr_attach_finished);
if (found_threads) {
REPORT_FATAL_ERROR_AND_EXIT(FAILED_TO_TAKE_OVER_THREADS, 2,
get_application_name(), get_application_pid());
}
char buf[16];
int num_threads = d_r_get_num_threads();
if (num_threads > 1) { /* avoid for early injection */
snprintf(buf, BUFFER_SIZE_ELEMENTS(buf), "%d", num_threads);
NULL_TERMINATE_BUFFER(buf);
SYSLOG(SYSLOG_INFORMATION, INFO_ATTACHED, 3, buf, get_application_name(),
get_application_pid());
}
}
/* Called by dynamorio_app_take_over in arch-specific assembly file */
void
dynamorio_app_take_over_helper(priv_mcontext_t *mc)
{
static bool have_taken_over = false; /* ASSUMPTION: not an actual write */
SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT);
APP_EXPORT_ASSERT(dynamo_initialized, PRODUCT_NAME " not initialized");
#ifdef RETURN_AFTER_CALL
/* FIXME : this is set after dynamo_initialized, so a slight race with
* an injected thread turning on .C protection before the main thread
* sets this. */
dr_preinjected = true; /* currently only relevant on Win32 */
#endif
LOG(GLOBAL, LOG_TOP, 1, "taking over via preinject in %s\n", __FUNCTION__);
if (!INTERNAL_OPTION(nullcalls) && !have_taken_over) {
have_taken_over = true;
LOG(GLOBAL, LOG_TOP, 1, "dynamorio_app_take_over\n");
/* set this flag to indicate that we should run until the program dies: */
automatic_startup = true;
if (DYNAMO_OPTION(inject_primary))
take_over_primary_thread();
/* who knows when this was called -- no guarantee we control all threads --
* unless we were auto-injected (preinject library calls this routine)
*/
control_all_threads = automatic_startup;
SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT);
if (IF_WINDOWS_ELSE(!dr_earliest_injected && !dr_early_injected, true)) {
/* Adjust the app stack to account for the return address + alignment.
* See dynamorio_app_take_over in x86.asm.
*/
mc->xsp += DYNAMO_START_XSP_ADJUST;
}
/* For hotp_only and thin_client, the app should run native, except
* for our hooks.
* This is where apps hooked using appinit key are let go native.
* Even though control is going to native app code, we want
* automatic_startup and control_all_threads set.
*/
if (!RUNNING_WITHOUT_CODE_CACHE())
dynamo_start(mc);
/* the interpreter takes over from here */
} else
SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT);
}
#ifdef WINDOWS
extern app_pc parent_early_inject_address; /* from os.c */
/* in arch-specific assembly file */
void
dynamorio_app_take_over(void);
DYNAMORIO_EXPORT void
dynamorio_app_init_and_early_takeover(uint inject_location, void *restore_code)
{
int res;
ASSERT(!dynamo_initialized && !dynamo_exited);
/* This routine combines dynamorio_app_init() and dynamorio_app_take_over() into
* a single routine that also handles any early injection cleanup needed. */
ASSERT_NOT_IMPLEMENTED(inject_location != INJECT_LOCATION_KiUserApc);
/* currently only Ldr* hook points are known to work */
ASSERT_CURIOSITY(INJECT_LOCATION_IS_LDR(inject_location));
/* See notes in os.c DLLMain. When early injected we are unable to find
* the address of LdrpLoadDll so we use the parent's value which is passed
* to us at the start of restore_code. FIXME - if we start using multiple
* inject locations we'll probably have to ensure we always pass this.
*/
if (INJECT_LOCATION_IS_LDR(inject_location)) {
parent_early_inject_address = *(app_pc *)restore_code;
}
dr_early_injected = true;
dr_early_injected_location = inject_location;
res = dynamorio_app_init();
ASSERT(res == SUCCESS);
ASSERT(dynamo_initialized && !dynamo_exited);
LOG(GLOBAL, LOG_TOP, 1, "taking over via early injection in %s\n", __FUNCTION__);
/* FIXME - restore code needs to be freed, but we have to return through it
* first... could instead duplicate its tail here if we wrap this
* routine in asm (or equivalent) and pass the continuation state in as args. */
ASSERT(inject_location != INJECT_LOCATION_KiUserApc);
dynamorio_app_take_over();
}
/* Called with DR library mapped in but without its imports processed.
*/
void
dynamorio_earliest_init_takeover_C(byte *arg_ptr, priv_mcontext_t *mc)
{
int res;
bool earliest_inject;
/* Windows-specific code for the most part */
earliest_inject = earliest_inject_init(arg_ptr);
/* Initialize now that DR dll imports are hooked up */
if (earliest_inject) {
dr_earliest_injected = true;
dr_earliest_inject_args = arg_ptr;
} else
dr_early_injected = true;
res = dynamorio_app_init();
ASSERT(res == SUCCESS);
ASSERT(dynamo_initialized && !dynamo_exited);
LOG(GLOBAL, LOG_TOP, 1, "taking over via earliest injection in %s\n", __FUNCTION__);
/* earliest_inject_cleanup() is called within dynamorio_app_init() to avoid
* confusing the exec areas scan
*/
dynamorio_app_take_over_helper(mc);
}
#endif /* WINDOWS */
/***************************************************************************
* SELF-PROTECTION
*/
/* FIXME: even with -single_privileged_thread, we aren't fully protected,
* because there's a window between us resuming the other threads and
* returning to our caller where another thread could clobber our return
* address or something.
*/
static void
dynamorio_protect(void)
{
ASSERT(SELF_PROTECT_ON_CXT_SWITCH);
LOG(GLOBAL, LOG_DISPATCH, 4, "dynamorio_protect thread=" TIDFMT "\n",
d_r_get_thread_id());
/* we don't protect local heap here, that's done lazily */
d_r_mutex_lock(&protect_info->lock);
ASSERT(protect_info->num_threads_unprot > 0);
/* FIXME: nice to also catch double enters but would need to track more info */
if (protect_info->num_threads_unprot <= 0) {
/* Defensive code to prevent crashes from double exits (the theory
* for case 7631/8030). However, this precludes an extra exit+enter
* pair from working properly (an extra enter+exit will continue to
* work), though such a pair would have crashed anyway if another
* thread had entered in the interim.
*/
protect_info->num_threads_unprot = 0;
d_r_mutex_unlock(&protect_info->lock);
return;
}
protect_info->num_threads_unprot--;
if (protect_info->num_threads_unprot > 0) {
/* other threads still in DR, cannot protect global memory */
LOG(GLOBAL, LOG_DISPATCH, 4, "dynamorio_protect: not last thread => nop\n");
d_r_mutex_unlock(&protect_info->lock);
return;
}
SELF_PROTECT_GLOBAL(READONLY);
if (INTERNAL_OPTION(single_privileged_thread)) {
/* FIXME: want to resume threads and allow thread creation only
* _after_ protect data segment, but lock is in data segment!
*/
if (protect_info->num_threads_suspended > 0) {
thread_record_t *tr;
int i, num = 0;
/* we do not need to grab the all_threads_lock because
* no threads can be added or removed so who cares if we
* access the data structure simultaneously with another
* reader of it
*/
for (i = 0; i < HASHTABLE_SIZE(ALL_THREADS_HASH_BITS); i++) {
for (tr = all_threads[i]; tr; tr = tr->next) {
if (tr->under_dynamo_control) {
os_thread_resume(tr); /* resume the matching record, not the bucket head */
num++;
}
}
}
ASSERT(num == protect_info->num_threads_suspended);
protect_info->num_threads_suspended = 0;
}
/* thread init/exit can proceed now */
d_r_mutex_unlock(&thread_initexit_lock);
}
/* FIXME case 8073: temporary until we put in unprots in the
* right places. if we were to leave this here we'd want to combine
* .fspdata and .cspdata for more efficient prot changes.
*/
SELF_PROTECT_DATASEC(DATASEC_FREQ_PROT);
SELF_PROTECT_DATASEC(DATASEC_CXTSW_PROT);
d_r_mutex_unlock(&protect_info->lock);
}
static void
dynamorio_unprotect(void)
{
ASSERT(SELF_PROTECT_ON_CXT_SWITCH);
d_r_mutex_lock(
&protect_info->lock); /* lock in unprot heap, not data segment, so safe! */
protect_info->num_threads_unprot++;
if (protect_info->num_threads_unprot == 1) {
/* was protected, so we need to do the unprotection */
SELF_UNPROTECT_DATASEC(DATASEC_CXTSW_PROT);
/* FIXME case 8073: temporary until we put in unprots in the
* right places. if we were to leave this here we'd want to combine
* .fspdata and .cspdata for more efficient prot changes.
*/
SELF_UNPROTECT_DATASEC(DATASEC_FREQ_PROT);
if (INTERNAL_OPTION(single_privileged_thread)) {
/* FIXME: want to suspend all other threads _before_ unprotecting anything,
* but need to guarantee no new threads while we're suspending them,
* and can't do that without setting a lock => need data segment!
*/
d_r_mutex_lock(&thread_initexit_lock);
if (d_r_get_num_threads() > 1) {
thread_record_t *tr;
int i;
/* current multiple-thread solution: suspend all other threads! */
ASSERT(protect_info->num_threads_suspended == 0);
/* we do not need to grab the all_threads_lock because
* no threads can be added or removed so who cares if we
* access the data structure simultaneously with another
* reader of it
*/
for (i = 0; i < HASHTABLE_SIZE(ALL_THREADS_HASH_BITS); i++) {
for (tr = all_threads[i]; tr; tr = tr->next) {
if (tr->under_dynamo_control) {
DEBUG_DECLARE(bool ok =)
os_thread_suspend(tr); /* suspend the matching record, not the bucket head */
ASSERT(ok);
protect_info->num_threads_suspended++;
}
}
}
}
/* we don't unlock or resume threads until we re-enter cache */
}
SELF_PROTECT_GLOBAL(WRITABLE);
}
/* we don't re-protect local heap here, that's done at points where
* it was protected lazily
*/
d_r_mutex_unlock(&protect_info->lock);
LOG(GLOBAL, LOG_DISPATCH, 4, "dynamorio_unprotect thread=" TIDFMT "\n",
d_r_get_thread_id());
}
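/* Illustration of the num_threads_unprot handoff implemented by the pair of
 * routines above:
 *   T1 enters DR:  0 -> 1  => unprotect (.data made writable)
 *   T2 enters DR:  1 -> 2  => nop, already writable
 *   T1 exits DR:   2 -> 1  => nop, T2 is still inside DR
 *   T2 exits DR:   1 -> 0  => re-protect global memory
 */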
#ifdef DEBUG
const char *
get_data_section_name(app_pc pc)
{
uint i;
for (i = 0; i < DATASEC_NUM; i++) {
if (pc >= datasec_start[i] && pc < datasec_end[i])
return DATASEC_NAMES[i];
}
return NULL;
}
bool
check_should_be_protected(uint sec)
{
/* Blindly asserting that a data section is protected is racy as
* another thread could be in an unprot window. We use some
* heuristics to try and identify bugs where a section is left
* unprot, but it's not easy.
*/
if (/* case 8107: for INJECT_LOCATION_LdrpLoadImportModule we
* load a helper library and end up in d_r_dispatch() for
* syscall_while_native before DR is initialized.
*/
!dynamo_initialized ||
# ifdef WINDOWS
/* case 8113: detach currently unprots .data prior to its
* thread synch, so don't count anything after that
*/
doing_detach ||
# endif
!TEST(DATASEC_SELFPROT[sec], DYNAMO_OPTION(protect_mask)) ||
DATASEC_PROTECTED(sec))
return true;
STATS_INC(datasec_not_prot);
/* FIXME: even checking d_r_get_num_threads()==1 is still racy as a thread could
* exit, and it's not worth grabbing thread_initexit_lock here.
*/
if (threads_ever_count == 1
# ifdef DR_APP_EXPORTS
/* For start/stop, can be other threads running around so we bail on
* perfect protection
*/
&& !dr_api_entry
# endif
)
return false;
/* FIXME: we have no count of threads in DR or anything, so we can't conclude
* much. Just return true and hope a developer looks at the datasec_not_prot
* stats. We do have an ASSERT_CURIOSITY on the stat in data_section_exit().
*/
return true;
}
# ifdef WINDOWS
/* Assumed to only be called about DR dll writable regions */
bool
data_sections_enclose_region(app_pc start, app_pc end)
{
/* Rather than solve the general enclose problem by sorting,
* we subtract each piece we find.
* It used to be that on 32-bit .data|.fspdata|.cspdata|.nspdata formed
* the only writable region, with .pdata between .data and .fspdata on 64.
* But building with VS2012, I'm seeing the sections in other orders (i#1075).
* And with x64 reachability we moved the interception buffer in .data,
* and marking it +rx results in sub-section calls to here.
*/
int i;
bool found_start = false, found_end = false;
ssize_t sz = end - start;
for (i = 0; i < DATASEC_NUM; i++) {
if (datasec_start[i] <= end && datasec_end[i] >= start) {
byte *overlap_start = MAX(datasec_start[i], start);
byte *overlap_end = MIN(datasec_end[i], end);
sz -= overlap_end - overlap_start;
}
}
return sz == 0;
}
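/* Worked example: if start..end exactly spans .data immediately followed by
 * .fspdata, the .data overlap and the .fspdata overlap together subtract the
 * full start..end byte count, leaving sz == 0 (enclosed); any gap not covered
 * by a section leaves sz > 0 and we return false.
 */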
# endif /* WINDOWS */
#endif /* DEBUG */
static void
get_data_section_bounds(uint sec)
{
/* FIXME: on linux we should include .got and .dynamic in one of our
* sections, requiring specifying the order of sections (case 3789)!
* Should use an ld script to ensure that .nspdata is last, or find a unique
* attribute to force separation (perhaps mark as rwx, then
* remove the x at init time?) ld 2.15 puts it at the end, but
* ld 2.13 puts .got and .dynamic after it! For now we simply
* don't protect subsequent guys.
* On win32 there are no other rw sections, fortunately.
*/
ASSERT(sec >= 0 && sec < DATASEC_NUM);
/* in DEBUG builds we use these bounds for data_sections_enclose_region() */
ASSERT(IF_WINDOWS(IF_DEBUG(true ||))
TEST(DATASEC_SELFPROT[sec], dynamo_options.protect_mask));
d_r_mutex_lock(&datasec_lock[sec]);
ASSERT(datasec_start[sec] == NULL);
get_named_section_bounds(get_dynamorio_dll_start(), DATASEC_NAMES[sec],
&datasec_start[sec], &datasec_end[sec]);
d_r_mutex_unlock(&datasec_lock[sec]);
ASSERT(ALIGNED(datasec_start[sec], PAGE_SIZE));
ASSERT(ALIGNED(datasec_end[sec], PAGE_SIZE));
ASSERT(datasec_start[sec] < datasec_end[sec]);
#ifdef WINDOWS
if (IF_DEBUG(true ||) TEST(DATASEC_SELFPROT[sec], dynamo_options.protect_mask))
merge_writecopy_pages(datasec_start[sec], datasec_end[sec]);
#endif
}
#ifdef UNIX
/* We get into problems if we keep a .section open across string literals, etc.
* (such as when wrapping a function to get its local-scope statics in that section),
* but the VAR_IN_SECTION does the real work for us, just so long as we have one
* .section decl somewhere.
*/
DECLARE_DATA_SECTION(RARELY_PROTECTED_SECTION, "w")
DECLARE_DATA_SECTION(FREQ_PROTECTED_SECTION, "w")
DECLARE_DATA_SECTION(NEVER_PROTECTED_SECTION, "w")
END_DATA_SECTION_DECLARATIONS()
#endif
static void
data_section_init(void)
{
uint i;
for (i = 0; i < DATASEC_NUM; i++) {
if (datasec_start[i] != NULL) {
/* We were called early due to an early syslog.
* We still retain our slightly later normal init position so we can
* log, etc. in normal runs.
*/
return;
}
ASSIGN_INIT_LOCK_FREE(datasec_lock[i], datasec_selfprot_lock);
/* in DEBUG builds we use these bounds for data_sections_enclose_region() */
if (IF_WINDOWS(IF_DEBUG(true ||))
TEST(DATASEC_SELFPROT[i], dynamo_options.protect_mask)) {
get_data_section_bounds(i);
}
}
DOCHECK(1, {
/* ensure no overlaps */
uint j;
for (i = 0; i < DATASEC_NUM; i++) {
for (j = i + 1; j < DATASEC_NUM; j++) {
ASSERT(datasec_start[i] >= datasec_end[j] ||
datasec_start[j] >= datasec_end[i]);
}
}
});
}
static void
data_section_exit(void)
{
uint i;
DOSTATS({
/* There can't have been that many races.
* A failure to re-protect should result in a ton of d_r_dispatch
* entrances w/ .data unprot, so should show up here.
* However, an app with threads that are initializing in DR and thus
* unprotected .data while other threads are running new code (such as
* on attach) can easily rack up hundreds of unprot cache entrances.
*/
ASSERT_CURIOSITY(GLOBAL_STAT(datasec_not_prot) < 5000);
});
for (i = 0; i < DATASEC_NUM; i++)
DELETE_LOCK(datasec_lock[i]);
}
#define DATASEC_WRITABLE_MOD(which, op) \
((which) == DATASEC_RARELY_PROT \
? (datasec_writable_rareprot op) \
: ((which) == DATASEC_CXTSW_PROT \
? (datasec_writable_cxtswprot op) \
: ((which) == DATASEC_FREQ_PROT \
? (datasec_writable_freqprot op) \
: (ASSERT_NOT_REACHED(), datasec_writable_neverprot))))
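/* E.g., DATASEC_WRITABLE_MOD(DATASEC_FREQ_PROT, ++) expands to
 * (datasec_writable_freqprot++), evaluating to the pre-increment value.
 */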
/* WARNING: any DO_ONCE will call this routine, so don't call anything here
* that has a DO_ONCE, to avoid deadlock!
*/
void
protect_data_section(uint sec, bool writable)
{
ASSERT(sec >= 0 && sec < DATASEC_NUM);
ASSERT(TEST(DATASEC_SELFPROT[sec], dynamo_options.protect_mask));
/* We can be called very early before data_section_init() so init here
* (data_section_init() has no dependences).
*/
if (datasec_start[sec] == NULL) {
/* should only happen early in init */
ASSERT(!dynamo_initialized);
data_section_init();
}
d_r_mutex_lock(&datasec_lock[sec]);
ASSERT(datasec_start[sec] != NULL);
/* if using libc, we cannot print while data segment is read-only!
* thus, if making it writable, do that first, otherwise do it last.
* w/ ntdll this is not a problem.
*/
/* Remember that multiple threads can be doing (unprotect,protect) pairs of
* calls simultaneously. The datasec_lock makes each individual call atomic,
* and if all calls are properly nested, our use of counters should result in
* the proper protection only after the final protect call and not in the
* middle of some other thread's writes to the data section.
*/
if (writable) {
/* On-context-switch protection has a separate mechanism for
* only protecting when the final thread leaves DR
*/
ASSERT_CURIOSITY(DATASEC_WRITABLE(sec) <= 2); /* shouldn't nest too deep! */
if (DATASEC_WRITABLE(sec) == 0) {
make_writable(datasec_start[sec], datasec_end[sec] - datasec_start[sec]);
STATS_INC(datasec_prot_changes);
} else
STATS_INC(datasec_prot_wasted_calls);
(void)DATASEC_WRITABLE_MOD(sec, ++);
}
LOG(TEST(DATASEC_SELFPROT[sec], SELFPROT_ON_CXT_SWITCH) ? THREAD_GET : GLOBAL,
LOG_VMAREAS, TEST(DATASEC_SELFPROT[sec], SELFPROT_ON_CXT_SWITCH) ? 3U : 2U,
"protect_data_section: thread " TIDFMT " %s (recur %d, stat %d) %s %s %d\n",
d_r_get_thread_id(), DATASEC_WRITABLE(sec) == 1 ? "changing" : "nop",
DATASEC_WRITABLE(sec), GLOBAL_STAT(datasec_not_prot), DATASEC_NAMES[sec],
writable ? "rw" : "r", DATASEC_WRITABLE(sec));
if (!writable) {
ASSERT(DATASEC_WRITABLE(sec) > 0);
(void)DATASEC_WRITABLE_MOD(sec, --);
if (DATASEC_WRITABLE(sec) == 0) {
make_unwritable(datasec_start[sec], datasec_end[sec] - datasec_start[sec]);
STATS_INC(datasec_prot_changes);
} else
STATS_INC(datasec_prot_wasted_calls);
}
d_r_mutex_unlock(&datasec_lock[sec]);
}
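/* Sketch of the nesting protocol above for one section (callers normally go
 * through SELF_UNPROTECT_DATASEC()/SELF_PROTECT_DATASEC() rather than calling
 * this directly):
 *   T1: protect_data_section(sec, true);   // count 0 -> 1: make_writable
 *   T2: protect_data_section(sec, true);   // count 1 -> 2: wasted-call nop
 *   T1: protect_data_section(sec, false);  // count 2 -> 1: nop, T2 still writing
 *   T2: protect_data_section(sec, false);  // count 1 -> 0: make_unwritable
 */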
/* enter/exit DR hooks */
void
entering_dynamorio(void)
{
if (SELF_PROTECT_ON_CXT_SWITCH)
dynamorio_unprotect();
ASSERT(HOOK_ENABLED);
LOG(GLOBAL, LOG_DISPATCH, 3, "entering_dynamorio thread=" TIDFMT "\n",
d_r_get_thread_id());
STATS_INC(num_entering_DR);
if (INTERNAL_OPTION(single_thread_in_DR)) {
acquire_recursive_lock(&thread_in_DR_exclusion);
LOG(GLOBAL, LOG_DISPATCH, 3, "entering_dynamorio thread=" TIDFMT " count=%d\n",
d_r_get_thread_id(), thread_in_DR_exclusion.count);
}
}
void
exiting_dynamorio(void)
{
ASSERT(HOOK_ENABLED);
LOG(GLOBAL, LOG_DISPATCH, 3, "exiting_dynamorio thread=" TIDFMT "\n",
d_r_get_thread_id());
STATS_INC(num_exiting_DR);
if (INTERNAL_OPTION(single_thread_in_DR)) {
/* thread init/exit can proceed now */
LOG(GLOBAL, LOG_DISPATCH, 3, "exiting_dynamorio thread=" TIDFMT " count=%d\n",
d_r_get_thread_id(), thread_in_DR_exclusion.count - 1);
release_recursive_lock(&thread_in_DR_exclusion);
}
if (SELF_PROTECT_ON_CXT_SWITCH && !dynamo_exited)
dynamorio_protect();
}
/* Note this includes any stack guard pages */
bool
is_on_initstack(byte *esp)
{
return (esp <= d_r_initstack && esp > d_r_initstack - DYNAMORIO_STACK_SIZE);
}
/* Note this includes any stack guard pages */
bool
is_on_dstack(dcontext_t *dcontext, byte *esp)
{
return (esp <= dcontext->dstack && esp > dcontext->dstack - DYNAMORIO_STACK_SIZE);
}
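/* Illustration: dstack (like d_r_initstack) is the highest address of the
 * stack and the stack grows downward, so with DYNAMORIO_STACK_SIZE == 0x8000
 * and dstack == 0x10000, esp values in (0x8000, 0x10000] are considered
 * on-stack, including any guard pages at the low end.
 */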
bool
is_currently_on_dstack(dcontext_t *dcontext)
{
byte *cur_esp;
GET_STACK_PTR(cur_esp);
return is_on_dstack(dcontext, cur_esp);
}
void
pre_second_thread(void)
{
/* i#1111: nop-out bb_building_lock until 2nd thread created.
* While normally we'll call this in the primary thread while not holding
* the lock, it's possible on Windows for an externally injected thread
* (or for a thread sneakily created by some native_exec code w/o going
* through ntdll wrappers) to appear. We solve the problem of the main
* thread currently holding bb_building_lock and us turning its
* unlock into an error by the bb_lock_would_have bool in
* SHARED_BB_UNLOCK().
*/
if (!bb_lock_start) {
d_r_mutex_lock(&bb_building_lock);
SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT);
bb_lock_start = true;
SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT);
d_r_mutex_unlock(&bb_building_lock);
}
}