/* *******************************************************************************
* Copyright (c) 2010-2014 Google, Inc. All rights reserved.
* Copyright (c) 2011 Massachusetts Institute of Technology All rights reserved.
* Copyright (c) 2000-2010 VMware, Inc. All rights reserved.
* *******************************************************************************/
/*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of VMware, Inc. nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
/* Copyright (c) 2003-2007 Determina Corp. */
/* Copyright (c) 2001-2003 Massachusetts Institute of Technology */
/* Copyright (c) 2000-2001 Hewlett-Packard Company */
/*
* os.c - Linux-specific routines
*/
/* Easiest to match kernel stat struct by using 64-bit.
* This limits us to 2.4+ kernel but that's ok.
* I don't really want to get into requiring kernel headers to build
* general release packages, though that would be fine for targeted builds.
* There are 3 different stat syscalls (SYS_oldstat, SYS_stat, and SYS_stat64)
* and using _LARGEFILE64_SOURCE with SYS_stat64 is the best match.
*/
#define _LARGEFILE64_SOURCE
/* for mmap-related #defines */
#include <sys/types.h>
#include <sys/mman.h>
/* in case MAP_32BIT is missing */
#ifndef MAP_32BIT
# define MAP_32BIT 0x40
#endif
#ifndef MAP_ANONYMOUS
# define MAP_ANONYMOUS MAP_ANON /* MAP_ANON on Mac */
#endif
/* for open */
#include <sys/stat.h>
#include <fcntl.h>
#include "../globals.h"
#include "../hashtable.h"
#include <string.h>
#include <unistd.h> /* for write and usleep and _exit */
#include <limits.h>
#ifdef MACOS
# include <sys/sysctl.h> /* for sysctl */
# ifndef SYS___sysctl
/* The name was changed on Yosemite */
# define SYS___sysctl SYS_sysctl
# endif
# include <mach/mach_traps.h> /* for swtch_pri */
# include "include/syscall_mach.h"
#endif
#ifdef LINUX
# include <sys/vfs.h> /* for statfs */
#elif defined(MACOS)
# include <sys/mount.h> /* for statfs */
# include <mach/mach.h>
# include <mach/task.h>
# include <mach/semaphore.h>
# include <mach/sync_policy.h>
#endif
#include <dirent.h>
/* for getrlimit */
#include <sys/time.h>
#include <sys/resource.h>
#ifdef LINUX
/* For clone and its flags, the manpage says to include sched.h with _GNU_SOURCE
* defined. _GNU_SOURCE brings in unwanted extensions and causes name
* conflicts. Instead, we include <linux/sched.h> directly from the Linux
* kernel headers.
*/
# include <linux/sched.h>
#endif
#include "module.h" /* elf */
#include "tls.h"
#ifndef F_DUPFD_CLOEXEC /* in linux 2.6.24+ */
# define F_DUPFD_CLOEXEC 1030
#endif
/* Cross arch syscall nums for use with struct stat64. */
#ifdef X64
# define SYSNUM_STAT SYS_stat
# define SYSNUM_FSTAT SYS_fstat
#else
# define SYSNUM_STAT SYS_stat64
# define SYSNUM_FSTAT SYS_fstat64
#endif
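/* Illustrative sketch (not a call made in this file): with _LARGEFILE64_SOURCE,
* struct stat64 matches what the kernel fills in for these syscalls:
*   struct stat64 st;
*   ptr_int_t res = dynamorio_syscall(SYSNUM_FSTAT, 2, fd, &st);
*/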
#ifdef MACOS
# define SYSNUM_EXIT_PROCESS SYS_exit
# define SYSNUM_EXIT_THREAD SYS_bsdthread_terminate
#else
# define SYSNUM_EXIT_PROCESS SYS_exit_group
# define SYSNUM_EXIT_THREAD SYS_exit
#endif
/* This is not always sufficient to identify a syscall return value.
* For example, MacOS has some 32-bit syscalls that return 64-bit
* values in xdx:xax.
*/
#define MCXT_SYSCALL_RES(mc) ((mc)->IF_X86_ELSE(xax, r0))
#ifdef ARM
# ifdef X64
# define ASM_R3 "x3"
# define READ_TP_TO_R3 "mrs "ASM_R3", tpidrro_el0 \n\t" /* read TPIDRRO_EL0 */
# else
# define ASM_R3 "r3"
# define READ_TP_TO_R3 "mrc p15, 0, "ASM_R3", c13, c0, 2 \n\t" /* read TPIDRURW */
# endif /* 64/32-bit */
#endif /* ARM */
/* Prototype for all functions in .init_array. */
typedef int (*init_fn_t)(int argc, char **argv, char **envp);
/* i#46: Private __environ pointer. Points at the environment variable array
* on the stack, which is different from what libc __environ may point at. We
* use the environment for following children and setting options, so it's OK
* that we don't see what libc says.
*/
char **our_environ;
#include <errno.h>
/* avoid problems with use of errno as var name in rest of file */
#undef errno
/* we define __set_errno below */
/* must be prior to <link.h> => <elf.h> => INT*_{MIN,MAX} */
# include "instr.h" /* for get_app_segment_base() */
#include "decode_fast.h" /* decode_cti: maybe os_handle_mov_seg should be ifdef X86? */
#include <dlfcn.h>
#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include <syslog.h> /* vsyslog */
#include "../vmareas.h"
#ifdef RCT_IND_BRANCH
# include "../rct.h"
#endif
#ifdef LINUX
# include "include/syscall.h" /* our own local copy */
#else
# include <sys/syscall.h>
#endif
#include "../module_shared.h"
#include "os_private.h"
#include "../synch.h"
#include "memquery.h"
#include "ksynch.h"
#ifndef HAVE_MEMINFO_QUERY
# include "memcache.h"
#endif
#ifdef CLIENT_INTERFACE
# include "instrument.h"
#endif
#ifdef NOT_DYNAMORIO_CORE_PROPER
# undef ASSERT
# undef ASSERT_NOT_IMPLEMENTED
# undef ASSERT_NOT_TESTED
# undef ASSERT_CURIOSITY
# define ASSERT(x) /* nothing */
# define ASSERT_NOT_IMPLEMENTED(x) /* nothing */
# define ASSERT_NOT_TESTED(x) /* nothing */
# define ASSERT_CURIOSITY(x) /* nothing */
# undef LOG
# undef DOSTATS
# define LOG(...) /* nothing */
# define DOSTATS(...) /* nothing */
#else /* !NOT_DYNAMORIO_CORE_PROPER: around most of file, to exclude preload */
/* Guards data written by os_set_app_thread_area(). */
DECLARE_CXTSWPROT_VAR(static mutex_t set_thread_area_lock,
INIT_LOCK_FREE(set_thread_area_lock));
#ifndef HAVE_TLS
/* We use a table lookup to find a thread's dcontext */
/* Our only current no-TLS target, VMKernel (VMX86_SERVER), doesn't have apps with
* tons of threads anyway
*/
#define MAX_THREADS 512
typedef struct _tls_slot_t {
thread_id_t tid;
dcontext_t *dcontext;
} tls_slot_t;
/* Stored in heap for self-prot */
static tls_slot_t *tls_table;
/* not static so deadlock_avoidance_unlock() can look for it */
DECLARE_CXTSWPROT_VAR(mutex_t tls_lock, INIT_LOCK_FREE(tls_lock));
#endif
#ifdef CLIENT_INTERFACE
/* Should we place this in a client header? Currently mentioned in
* dr_raw_tls_calloc() docs.
*/
static bool client_tls_allocated[MAX_NUM_CLIENT_TLS];
DECLARE_CXTSWPROT_VAR(static mutex_t client_tls_lock, INIT_LOCK_FREE(client_tls_lock));
#endif
#include <stddef.h> /* for offsetof */
#include <sys/utsname.h> /* for struct utsname */
/* forward decl */
static void handle_execve_post(dcontext_t *dcontext);
static bool os_switch_lib_tls(dcontext_t *dcontext, bool to_app);
static bool os_switch_seg_to_context(dcontext_t *dcontext, reg_id_t seg, bool to_app);
#ifdef LINUX
static bool handle_app_mremap(dcontext_t *dcontext, byte *base, size_t size,
byte *old_base, size_t old_size,
uint old_prot, uint old_type);
static void handle_app_brk(dcontext_t *dcontext, byte *old_brk, byte *new_brk);
#endif
/* full path to our own library, used for execve */
static char dynamorio_library_path[MAXIMUM_PATH];
/* Issue 20: path to other architecture */
static char dynamorio_alt_arch_path[MAXIMUM_PATH];
/* Makefile passes us LIBDIR_X{86,64} defines */
#define DR_LIBDIR_X86 STRINGIFY(LIBDIR_X86)
#define DR_LIBDIR_X64 STRINGIFY(LIBDIR_X64)
/* pc values delimiting dynamo dll image */
static app_pc dynamo_dll_start = NULL;
static app_pc dynamo_dll_end = NULL; /* open-ended */
static app_pc executable_start = NULL;
static app_pc executable_end = NULL;
/* Used by get_application_name(). */
static char executable_path[MAXIMUM_PATH];
static char *executable_basename;
/* does the kernel provide tids that must be used to distinguish threads in a group? */
static bool kernel_thread_groups;
static bool kernel_64bit;
pid_t pid_cached;
static bool fault_handling_initialized;
#ifdef PROFILE_RDTSC
uint kilo_hertz; /* cpu clock speed */
#endif
/* Xref PR 258731, dup of STDOUT/STDERR in case app wants to close them. */
DR_API file_t our_stdout = STDOUT_FILENO;
DR_API file_t our_stderr = STDERR_FILENO;
DR_API file_t our_stdin = STDIN_FILENO;
/* we steal fds from the app */
static struct rlimit app_rlimit_nofile;
/* we store all DR files so we can prevent the app from changing them,
* and so we can close them in a child of fork.
* the table key is the fd and the payload is the set of DR_FILE_* flags.
*/
static generic_table_t *fd_table;
#define INIT_HTABLE_SIZE_FD 6 /* should remain small */
static bool
is_readable_without_exception_internal(const byte *pc, size_t size, bool query_os);
static void
process_mmap(dcontext_t *dcontext, app_pc base, size_t size, uint prot,
uint flags _IF_DEBUG(const char *map_type));
#ifdef LINUX
static char *
read_proc_self_exe(bool ignore_cache);
#endif
/* Libc-independent directory iterator, similar to readdir. If we ever need
* this on Windows we should generalize it and export it to clients.
*/
typedef struct _dir_iterator_t {
file_t fd;
int off;
int end;
const char *name; /* Name of the current entry. */
char buf[4 * MAXIMUM_PATH]; /* Expect stack alloc, so not too big. */
} dir_iterator_t;
static void os_dir_iterator_start(dir_iterator_t *iter, file_t fd);
static bool os_dir_iterator_next(dir_iterator_t *iter);
/* XXX: If we generalize to Windows, will we need os_dir_iterator_stop()? */
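/* Hypothetical usage sketch:
*   dir_iterator_t iter;
*   os_dir_iterator_start(&iter, fd);
*   while (os_dir_iterator_next(&iter))
*       ... use iter.name, valid only until the next call ...
*/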
/* vsyscall page. Hardcoded at 0xffffe000 in earlier kernels, but
* randomly placed since Fedora Core 2.
* It was marked rx in those kernels: FIXME: we should disallow it when that's the case!
* The randomized vsyscall page is identified in maps files as "[vdso]"
* (a kernel-provided fake shared library, the Virtual Dynamic Shared Object).
*/
app_pc vsyscall_page_start = NULL;
/* pc of the end of the syscall instr itself */
app_pc vsyscall_syscall_end_pc = NULL;
/* pc where kernel returns control after sysenter vsyscall */
app_pc vsyscall_sysenter_return_pc = NULL;
#define VSYSCALL_PAGE_START_HARDCODED ((app_pc)(ptr_uint_t) 0xffffe000)
#ifdef X64
/* i#430: in Red Hat Enterprise Server 5.6, the vsyscall region is marked
* not executable
* ffffffffff600000-ffffffffffe00000 ---p 00000000 00:00 0 [vsyscall]
*/
# define VSYSCALL_REGION_MAPS_NAME "[vsyscall]"
#endif
#if !defined(STANDALONE_UNIT_TEST) && !defined(STATIC_LIBRARY)
/* The pthreads library keeps errno in its pthread_descr data structure,
* which it looks up by dispatching on the stack pointer. This doesn't work
* when within dynamo. Thus, we define our own __errno_location() for use both
* by us and the app, to prevent pthreads looking at the stack pointer when
* out of the code cache.
*/
/* FIXME: maybe we should create 1st dcontext earlier so we don't need init_errno?
* any problems with init_errno being set and then dcontext->errno being read?
* FIXME: if a thread issues a dr_app_stop, then we don't want to use
* this errno slot? But it may later do a start...probably ok to keep using
* the slot. But, when threads die, they'll all use the same init_errno!
*/
static int init_errno; /* errno until 1st dcontext created */
int *
__errno_location(void) {
/* Each dynamo thread should have a separate errno */
dcontext_t *dcontext = get_thread_private_dcontext();
if (dcontext == NULL)
return &init_errno;
else {
/* WARNING: init_errno is in data segment so can be RO! */
return &(dcontext->upcontext_ptr->errno);
}
}
#endif /* !STANDALONE_UNIT_TEST && !STATIC_LIBRARY */
#if defined(HAVE_TLS) && defined(CLIENT_INTERFACE)
/* i#598
* (gdb) x/20i (*(errno_loc_t)0xf721e413)
* 0xf721e413 <__errno_location>: push %ebp
* 0xf721e414 <__errno_location+1>: mov %esp,%ebp
* 0xf721e416 <__errno_location+3>: call <__x86.get_pc_thunk.cx>
* 0xf721e41b <__errno_location+8>: add $0x166bd9,%ecx
* 0xf721e421 <__errno_location+14>: mov -0x1c(%ecx),%eax
* 0xf721e427 <__errno_location+20>: add %gs:0x0,%eax
* 0xf721e42e <__errno_location+27>: pop %ebp
* 0xf721e42f <__errno_location+28>: ret
*
* __errno_location calculates the errno location by adding
* errno's offset in TLS to the TLS base.
* However, because the TLS has been switched in os_tls_init,
* the calculated address is wrong.
* We instead get the errno offset in TLS at init time and
* calculate the correct address by adding the app's TLS base.
*/
/* __errno_location on ARM:
* 0xb6f0b290 <__errno_location>: ldr r3, [pc, #12]
* 0xb6f0b292 <__errno_location+2>: mrc 15, 0, r0, cr13, cr0, {3}
* 0xb6f0b296 <__errno_location+6>: add r3, pc
* 0xb6f0b298 <__errno_location+8>: ldr r3, [r3, #0]
* 0xb6f0b29a <__errno_location+10>: adds r0, r0, r3
* 0xb6f0b29c <__errno_location+12>: bx lr
* It uses the predefined offset to get errno location in TLS,
* and we should be able to reuse the code here.
*/
static int libc_errno_tls_offs;
static int *
our_libc_errno_loc(void)
{
void *app_tls = os_get_app_seg_base(NULL, LIB_SEG_TLS);
if (app_tls == NULL)
return NULL;
return (int *)(app_tls + libc_errno_tls_offs);
}
#endif
/* i#238/PR 499179: libc errno preservation
*
* Errno location is per-thread so we store the
* function globally and call it each time. Note that pthreads seems
* to be the one who provides per-thread errno: using raw syscalls to
* create threads, we end up with a global errno:
*
* > for i in linux.thread.*0/log.*; do grep 'libc errno' $i | head -1; done
* libc errno loc: 0x00007f153de26698
* libc errno loc: 0x00007f153de26698
* > for i in pthreads.pthreads.*0/log.*; do grep 'libc errno' $i | head -1; done
* libc errno loc: 0x00007fc24d1ce698
* libc errno loc: 0x00007fc24d1cd8b8
* libc errno loc: 0x00007fc24c7cc8b8
*/
typedef int *(*errno_loc_t)(void);
static errno_loc_t
get_libc_errno_location(bool do_init)
{
static errno_loc_t libc_errno_loc;
if (do_init) {
module_iterator_t *mi = module_iterator_start();
while (module_iterator_hasnext(mi)) {
module_area_t *area = module_iterator_next(mi);
const char *modname = GET_MODULE_NAME(&area->names);
/* We ensure the match is at the start of the name to avoid matching "libgolibc.so".
* GET_MODULE_NAME never includes the path: i#138 will add path.
*/
if (modname != NULL && strstr(modname, "libc.so") == modname) {
bool found = true;
/* called during init when .data is writable */
libc_errno_loc = (errno_loc_t)
get_proc_address(area->start, "__errno_location");
ASSERT(libc_errno_loc != NULL);
LOG(GLOBAL, LOG_THREADS, 2, "libc errno loc func: "PFX"\n",
libc_errno_loc);
#ifdef CLIENT_INTERFACE
/* Currently, DR is loaded by the system loader and hooked up
* to the app's libc, so right now we still need this routine.
* We can remove this once we have libc independence and/or
* early injection.
*/
if (INTERNAL_OPTION(private_loader)) {
acquire_recursive_lock(&privload_lock);
if (privload_lookup_by_base(area->start) != NULL)
found = false;
release_recursive_lock(&privload_lock);
}
#endif
if (found)
break;
}
}
module_iterator_stop(mi);
#if defined(HAVE_TLS) && defined(CLIENT_INTERFACE)
/* i#598: init the libc errno's offset. If we didn't find libc above,
* then we don't need to do this.
*/
if (INTERNAL_OPTION(private_loader) && libc_errno_loc != NULL) {
void *dr_lib_tls_base = os_get_dr_seg_base(NULL, LIB_SEG_TLS);
ASSERT(dr_lib_tls_base != NULL);
libc_errno_tls_offs = (void *)libc_errno_loc() - dr_lib_tls_base;
libc_errno_loc = &our_libc_errno_loc;
}
#endif
}
return libc_errno_loc;
}
/* i#238/PR 499179: our __errno_location isn't affecting libc, so until
* we have libc independence or our own private isolated libc, we need
* to preserve the app's libc's errno.
*/
int
get_libc_errno(void)
{
#ifdef STANDALONE_UNIT_TEST
errno_loc_t func = __errno_location;
#else
errno_loc_t func = get_libc_errno_location(false);
#endif
if (func == NULL) {
/* libc hasn't been loaded yet or we're doing early injection. */
return 0;
} else {
int *loc = (*func)();
ASSERT(loc != NULL);
LOG(THREAD_GET, LOG_THREADS, 5, "libc errno loc: "PFX"\n", loc);
if (loc != NULL)
return *loc;
}
return 0;
}
/* N.B.: pthreads has two other locations it keeps on a per-thread basis:
* h_errno and res_state. See glibc-2.2.4/linuxthreads/errno.c.
* If dynamo ever modifies those we'll need to do to them what we now do to
* errno.
*/
/* The environment vars exhibit totally messed up behavior when someone
* does an execve of /bin/sh -- not sure what's going on, but using our
* own implementation of unsetenv fixes all our problems. If we use
* libc's, unsetenv either does nothing or ends up having getenv return
* NULL for other vars that are obviously set (by iterating through environ).
* FIXME: find out the real story here.
*/
int
our_unsetenv(const char *name)
{
size_t len;
char **ep;
if (name == NULL || *name == '\0' || strchr (name, '=') != NULL) {
return -1;
}
ASSERT(our_environ != NULL);
if (our_environ == NULL)
return -1;
len = strlen (name);
/* FIXME: glibc code grabs a lock here, we don't have access to that lock
* LOCK;
*/
ep = our_environ;
while (*ep != NULL)
if (!strncmp (*ep, name, len) && (*ep)[len] == '=') {
/* Found it. Remove this pointer by moving later ones back. */
char **dp = ep;
do {
dp[0] = dp[1];
} while (*dp++);
/* Continue the loop in case NAME appears again. */
} else
++ep;
/* FIXME: glibc code unlocks here, we don't have access to that lock
* UNLOCK;
*/
return 0;
}
/* i#46: Private getenv.
*/
char *
getenv(const char *name)
{
char **ep = our_environ;
size_t i;
size_t name_len;
if (name == NULL || name[0] == '\0' || strchr(name, '=') != NULL) {
return NULL;
}
ASSERT_MESSAGE(CHKLVL_ASSERTS, "our_environ is missing. _init() or "
"dynamorio_set_envp() were not called", our_environ != NULL);
if (our_environ == NULL)
return NULL;
name_len = strlen(name);
for (i = 0; ep[i] != NULL; i++) {
if (strncmp(ep[i], name, name_len) == 0 && ep[i][name_len] == '=') {
return ep[i] + name_len + 1;
}
}
return NULL;
}
/* Work around drpreload's _init going first. We can get envp in our own _init
* routine down below, but drpreload.so comes first and calls
* dynamorio_app_init before our own _init routine gets called. Apps using the
* app API are unaffected because our _init routine will have run by then. For
* STATIC_LIBRARY, we simply set our_environ below in our_init().
*/
DYNAMORIO_EXPORT
void
dynamorio_set_envp(char **envp)
{
our_environ = envp;
}
/* shared library init */
int
our_init(int argc, char **argv, char **envp)
{
/* if do not want to use drpreload.so, we can take over here */
extern void dynamorio_app_take_over(void);
bool takeover = false;
#ifdef INIT_TAKE_OVER
takeover = true;
#endif
#ifdef VMX86_SERVER
/* PR 391765: take over here instead of using preload */
takeover = os_in_vmkernel_classic();
#endif
if (our_environ != NULL) {
/* Set by dynamorio_set_envp above. These should agree. */
ASSERT(our_environ == envp);
} else {
our_environ = envp;
}
if (!takeover) {
const char *takeover_env = getenv("DYNAMORIO_TAKEOVER_IN_INIT");
if (takeover_env != NULL && strcmp(takeover_env, "1") == 0) {
takeover = true;
}
}
if (takeover) {
if (dynamorio_app_init() == 0 /* success */) {
dynamorio_app_take_over();
}
}
return 0;
}
#if defined(STATIC_LIBRARY) || defined(STANDALONE_UNIT_TEST)
/* If we're getting linked into a binary that already has an _init definition
* like the app's exe or unit_tests, we add a pointer to our_init() to the
* .init_array section. We can't use the constructor attribute because not all
* toolchains pass the args and environment to the constructor.
*/
static init_fn_t
__attribute__ ((section (".init_array"), aligned (sizeof (void *)), used))
init_array[] = {
our_init
};
#else
/* If we're a normal shared object, then we override _init.
*/
int
_init(int argc, char **argv, char **envp)
{
return our_init(argc, argv, envp);
}
#endif
bool
kernel_is_64bit(void)
{
return kernel_64bit;
}
#ifdef MACOS
/* XXX: if we get enough of these, move to os_macos.c or sthg */
static bool
sysctl_query(int level0, int level1, void *buf, size_t bufsz)
{
int res;
int name[2];
size_t len = bufsz;
name[0] = level0;
name[1] = level1;
res = dynamorio_syscall(SYS___sysctl, 6, &name, 2, buf, &len, NULL, 0);
return (res >= 0);
}
#endif
static void
get_uname(void)
{
/* assumption: only called at init, so we don't need any synch
* or .data unprot
*/
static struct utsname uinfo; /* can be large, avoid stack overflow */
#ifdef MACOS
if (!sysctl_query(CTL_KERN, KERN_OSTYPE, &uinfo.sysname, sizeof(uinfo.sysname)) ||
!sysctl_query(CTL_KERN, KERN_HOSTNAME, &uinfo.nodename,
sizeof(uinfo.nodename)) ||
!sysctl_query(CTL_KERN, KERN_OSRELEASE, &uinfo.release, sizeof(uinfo.release)) ||
!sysctl_query(CTL_KERN, KERN_VERSION, &uinfo.version, sizeof(uinfo.version)) ||
!sysctl_query(CTL_HW, HW_MACHINE, &uinfo.machine, sizeof(uinfo.machine))) {
ASSERT(false && "sysctl queries failed");
return;
}
#else
DEBUG_DECLARE(int res =)
dynamorio_syscall(SYS_uname, 1, (ptr_uint_t)&uinfo);
ASSERT(res >= 0);
#endif
LOG(GLOBAL, LOG_TOP, 1, "uname:\n\tsysname: %s\n", uinfo.sysname);
LOG(GLOBAL, LOG_TOP, 1, "\tnodename: %s\n", uinfo.nodename);
LOG(GLOBAL, LOG_TOP, 1, "\trelease: %s\n", uinfo.release);
LOG(GLOBAL, LOG_TOP, 1, "\tversion: %s\n", uinfo.version);
LOG(GLOBAL, LOG_TOP, 1, "\tmachine: %s\n", uinfo.machine);
if (strncmp(uinfo.machine, "x86_64", sizeof("x86_64")) == 0)
kernel_64bit = true;
#ifdef MACOS
/* XXX: I would skip these checks for standalone so we don't have to set env
* vars for frontends to see the options, but I'm still afraid of some syscall
* crashing with no output: I'd rather have two messages than a silent crash.
*/
if (DYNAMO_OPTION(max_supported_os_version) != 0) { /* 0 disables */
/* We only support OSX 10.7.5 - 10.9.1. That means kernels 11.x-13.x. */
# define MIN_DARWIN_VERSION_SUPPORTED 11
int kernel_major;
if (sscanf(uinfo.release, "%d", &kernel_major) != 1 ||
kernel_major > DYNAMO_OPTION(max_supported_os_version) ||
kernel_major < MIN_DARWIN_VERSION_SUPPORTED) {
/* We make this non-fatal as it's likely DR will work */
SYSLOG(SYSLOG_WARNING, UNSUPPORTED_OS_VERSION, 3, get_application_name(),
get_application_pid(), uinfo.release);
}
}
#endif
}
/* os-specific initializations */
void
os_init(void)
{
ksynch_init();
get_uname();
/* Populate global data caches. */
get_application_name();
get_application_base();
/* determine whether gettid is provided and needed for threads,
* or whether getpid suffices. Even 2.4 kernels have gettid
* (it maps to getpid); we don't have an old enough target to test this.
*/
#ifdef MACOS
kernel_thread_groups = (dynamorio_syscall(SYS_thread_selfid, 0) >= 0);
#else
kernel_thread_groups = (dynamorio_syscall(SYS_gettid, 0) >= 0);
#endif
LOG(GLOBAL, LOG_TOP|LOG_STATS, 1, "thread id is from %s\n",
kernel_thread_groups ? "gettid" : "getpid");
#ifdef MACOS
/* SYS_thread_selfid was added in 10.6. We have no simple way to get the
* thread id on 10.5, so we don't support it.
*/
if (!kernel_thread_groups) {
SYSLOG(SYSLOG_WARNING, UNSUPPORTED_OS_VERSION, 3,
get_application_name(), get_application_pid(), "Mac OSX 10.5 or earlier");
}
#else
ASSERT_CURIOSITY(kernel_thread_groups);
#endif
pid_cached = get_process_id();
#ifdef VMX86_SERVER
vmk_init();
#endif
signal_init();
/* We now set up an early fault handler for safe_read() (i#350) */
fault_handling_initialized = true;
memquery_init();
#ifdef PROFILE_RDTSC
if (dynamo_options.profile_times) {
ASSERT_NOT_TESTED();
kilo_hertz = get_timer_frequency();
LOG(GLOBAL, LOG_TOP|LOG_STATS, 1, "CPU MHz is %d\n", kilo_hertz/1000);
}
#endif /* PROFILE_RDTSC */
/* Needs to be after heap_init */
IF_NO_MEMQUERY(memcache_init());
/* we didn't have the heap in os_file_init(), so we create the fd table and add the global logfile now */
fd_table = generic_hash_create(GLOBAL_DCONTEXT, INIT_HTABLE_SIZE_FD,
80 /* load factor: not perf-critical */,
HASHTABLE_SHARED | HASHTABLE_PERSISTENT,
NULL _IF_DEBUG("fd table"));
#ifdef DEBUG
if (GLOBAL != INVALID_FILE)
fd_table_add(GLOBAL, OS_OPEN_CLOSE_ON_FORK);
#endif
/* Ensure initialization */
get_dynamorio_dll_start();
}
/* called before any logfiles are opened */
void
os_file_init(void)
{
/* We steal fds from the app for better transparency. We lower the max file
* descriptor limit as viewed by the app, and block SYS_dup{2,3} and
* SYS_fcntl(F_DUPFD*) from creating a file explicitly in our space. We do
* not try to stop incremental file opening from extending into our space:
* if the app really is running out of fds, we'll give it some of ours:
* after all we probably don't need all -steal_fds, and if we really need fds
* we typically open them at startup. We also don't bother watching all
* syscalls that take in fds from affecting our fds.
*/
if (DYNAMO_OPTION(steal_fds) > 0) {
struct rlimit rlimit_nofile;
if (dynamorio_syscall(SYS_getrlimit, 2, RLIMIT_NOFILE, &rlimit_nofile) != 0) {
/* linux default is 1024 */
SYSLOG_INTERNAL_WARNING("getrlimit RLIMIT_NOFILE failed"); /* can't LOG yet */
rlimit_nofile.rlim_cur = 1024;
rlimit_nofile.rlim_max = 1024;
}
/* pretend the limit is lower and reserve the top spots for us.
* for simplicity and to give as much room as possible to app,
* raise soft limit to equal hard limit.
* if an app really depends on a low soft limit, they can run
* with -steal_fds 0.
*/
if (rlimit_nofile.rlim_max > DYNAMO_OPTION(steal_fds)) {
int res;
app_rlimit_nofile.rlim_max = rlimit_nofile.rlim_max - DYNAMO_OPTION(steal_fds);
app_rlimit_nofile.rlim_cur = app_rlimit_nofile.rlim_max;
rlimit_nofile.rlim_cur = rlimit_nofile.rlim_max;
res = dynamorio_syscall(SYS_setrlimit, 2, RLIMIT_NOFILE, &rlimit_nofile);
if (res != 0) {
SYSLOG_INTERNAL_WARNING("unable to raise RLIMIT_NOFILE soft limit: %d",
res);
}
} else /* not fatal: we'll just end up using fds in app space */
SYSLOG_INTERNAL_WARNING("unable to reserve fds");
}
/* we don't have heap set up yet so we init fd_table in os_init */
}
/* we need to re-cache after a fork */
static char *
get_application_pid_helper(bool ignore_cache)
{
static char pidstr[16];
if (!pidstr[0] || ignore_cache) {
int pid = get_process_id();
snprintf(pidstr, sizeof(pidstr)-1, "%d", pid);
}
return pidstr;
}
/* get the application pid (cached); used for event logging */
char*
get_application_pid()
{
return get_application_pid_helper(false);
}
/* i#907: Called during early injection before data section protection to avoid
* issues with /proc/self/exe.
*/
void
set_executable_path(const char *exe_path)
{
strncpy(executable_path, exe_path, BUFFER_SIZE_ELEMENTS(executable_path));
NULL_TERMINATE_BUFFER(executable_path);
}
/* i#189: we need to re-cache after a fork */
static char *
get_application_name_helper(bool ignore_cache, bool full_path)
{
if (!executable_path[0] || ignore_cache) {
#ifdef VMX86_SERVER
if (os_in_vmkernel_userworld()) {
vmk_getnamefrompid(pid, executable_path, sizeof(executable_path));
} else
#endif
if (DYNAMO_OPTION(early_inject)) {
ASSERT(executable_path[0] != '\0' &&
"i#907: Can't read /proc/self/exe for early injection");
} else {
#ifdef LINUX
/* Populate cache from /proc/self/exe link. */
strncpy(executable_path, read_proc_self_exe(ignore_cache),
BUFFER_SIZE_ELEMENTS(executable_path));
#else
/* OSX kernel puts full app exec path above envp */
char *c, **env = our_environ;
do {
env++;
} while (*env != NULL);
env++; /* Skip the NULL separating the envp array from exec_path */
c = *env;
/* If our frontends always absolute-ize paths prior to exec,
* this should usually be absolute -- but we go ahead and
* handle relative just in case (and to handle child processes).
* We add the cur dir, but note that the resulting path can
* still contain . or .. so it's not normalized (but it is a
* correct absolute path). Xref i#1402, i#1406, i#1407.
*/
if (*c != '/') {
int len;
if (!os_get_current_dir(executable_path,
BUFFER_SIZE_ELEMENTS(executable_path)))
len = 0;
else
len = strlen(executable_path);
snprintf(executable_path + len,
BUFFER_SIZE_ELEMENTS(executable_path) - len,
"%s%s", len > 0 ? "/" : "", c);
} else
strncpy(executable_path, c, BUFFER_SIZE_ELEMENTS(executable_path));
#endif
NULL_TERMINATE_BUFFER(executable_path);
/* FIXME: Fall back on /proc/self/cmdline and maybe argv[0] from
* _init().
*/
ASSERT(strlen(executable_path) > 0 &&
"readlink /proc/self/exe failed");
}
}
/* Get basename. */
if (executable_basename == NULL || ignore_cache) {
executable_basename = strrchr(executable_path, '/');
executable_basename = (executable_basename == NULL ?
executable_path : executable_basename + 1);
}
return (full_path ? executable_path : executable_basename);
}
/* get the application name (cached); used for event logging */
char *
get_application_name(void)
{
return get_application_name_helper(false, true /* full path */);
}
/* Note: this is exported so that libdrpreload.so (preload.c) can use it to
* get process names to do selective process following (PR 212034). The
* alternative is to duplicate or compile in this code into libdrpreload.so,
* which is messy. Besides, libdynamorio.so is already loaded into the process
* and available, so it's cleaner to just use functions from it.
*/
DYNAMORIO_EXPORT const char *
get_application_short_name(void)
{
return get_application_name_helper(false, false /* short name */);
}
/* Processor information provided by kernel */
#define PROC_CPUINFO "/proc/cpuinfo"
#define CPUMHZ_LINE_LENGTH 64
#define CPUMHZ_LINE_FORMAT "cpu MHz\t\t: %lu.%03lu\n"
/* printed in /usr/src/linux-2.4/arch/i386/kernel/setup.c calibrated in time.c */
/* seq_printf(m, "cpu MHz\t\t: %lu.%03lu\n", cpu_khz / 1000, (cpu_khz % 1000)) */
/* e.g. cpu MHz : 1594.851 */
static timestamp_t
get_timer_frequency_cpuinfo(void)
{
file_t cpuinfo;
ssize_t nread;
char *buf;
char *mhz_line;
ulong cpu_mhz = 1000;
ulong cpu_khz = 0;
cpuinfo = os_open(PROC_CPUINFO, OS_OPEN_READ);
/* This can happen in a chroot or if /proc is disabled. */
if (cpuinfo == INVALID_FILE)
return 1000 * 1000; /* 1 GHz */
/* cpu MHz is typically in the first 4096 bytes. If not, or we get a short
* or interrupted read, our timer frequency estimate will be off, but it's
* not the end of the world.
* FIXME: Factor a buffered file reader out of our maps iterator if we want
* to do this the right way.
*/
buf = global_heap_alloc(PAGE_SIZE HEAPACCT(ACCT_OTHER));
nread = os_read(cpuinfo, buf, PAGE_SIZE - 1);
if (nread > 0) {
buf[nread] = '\0';
mhz_line = strstr(buf, "cpu MHz\t\t:");
if (mhz_line != NULL &&
sscanf(mhz_line, CPUMHZ_LINE_FORMAT, &cpu_mhz, &cpu_khz) == 2) {
LOG(GLOBAL, LOG_ALL, 2, "Processor speed exactly %lu.%03luMHz\n",
cpu_mhz, cpu_khz);
}
}
global_heap_free(buf, PAGE_SIZE HEAPACCT(ACCT_OTHER));
os_close(cpuinfo);
return cpu_mhz * 1000 + cpu_khz;
}
timestamp_t
get_timer_frequency()
{
#ifdef VMX86_SERVER
if (os_in_vmkernel_userworld()) {
return vmk_get_timer_frequency();
}
#endif
return get_timer_frequency_cpuinfo();
}
/* DR has standardized on UTC time, which counts from Jan 1, 1601.
* That's the Windows standard. But Linux uses the Epoch of Jan 1, 1970.
*/
#define UTC_TO_EPOCH_SECONDS 11644473600
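/* 1601..1970 spans 369 years, 89 of them leap years:
* (369*365 + 89) * 24*60*60 = 11644473600.
*/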
/* seconds since 1601 */
uint
query_time_seconds(void)
{
#ifdef MACOS
struct timeval tv;
/* MacOS returns usecs:secs and does not set the timeval struct */
uint64 val = dynamorio_syscall(SYS_gettimeofday, 2, &tv, NULL);
if ((int)val < 0)
return 0;
return (uint)val + UTC_TO_EPOCH_SECONDS;
#else
return (uint) dynamorio_syscall(SYS_time, 1, NULL) + UTC_TO_EPOCH_SECONDS;
#endif
}
/* milliseconds since 1601 */
uint64
query_time_millis()
{
struct timeval current_time;
#ifdef MACOS
/* MacOS returns usecs:secs and does not set the timeval struct */
uint64 val = dynamorio_syscall(SYS_gettimeofday, 2, &current_time, NULL);
current_time.tv_sec = (uint) val;
current_time.tv_usec = (uint)(val >> 32);
if ((int)val > 0) {
#else
if (dynamorio_syscall(SYS_gettimeofday, 2, &current_time, NULL) >= 0) {
#endif
uint64 res = (((uint64)current_time.tv_sec) * 1000) +
(current_time.tv_usec / 1000);
res += UTC_TO_EPOCH_SECONDS * 1000;
return res;
} else {
ASSERT_NOT_REACHED();
return 0;
}
}
#ifdef RETURN_AFTER_CALL
/* Finds the bottom of the call stack, presumably at program startup. */
/* This routine is a copycat of internal_dump_callstack and makes assumptions
* about program state, i.e., that frame pointers are valid; it should be used
* only at well-known points for release builds.
*/
static app_pc
find_stack_bottom()
{
app_pc retaddr = 0;
int depth = 0;
reg_t *fp;
/* from dump_dr_callstack() */
asm("mov %%"ASM_XBP", %0" : "=m"(fp));
LOG(THREAD_GET, LOG_ALL, 3, "Find stack bottom:\n");
while (fp != NULL && is_readable_without_exception((byte *)fp, sizeof(reg_t)*2)) {
retaddr = (app_pc)*(fp+1); /* presumably also readable */
LOG(THREAD_GET, LOG_ALL, 3,
"\tframe ptr "PFX" => parent "PFX", ret = "PFX"\n", fp, *fp, retaddr);
depth++;
/* yes I've seen weird recursive cases before */
if (fp == (reg_t *) *fp || depth > 100)
break;
fp = (reg_t *) *fp;
}
return retaddr;
}
#endif /* RETURN_AFTER_CALL */
/* os-specific atexit cleanup */
void
os_slow_exit(void)
{
signal_exit();
memquery_exit();
ksynch_exit();
generic_hash_destroy(GLOBAL_DCONTEXT, fd_table);
fd_table = NULL;
DELETE_LOCK(set_thread_area_lock);
#ifdef CLIENT_INTERFACE
DELETE_LOCK(client_tls_lock);
#endif
IF_NO_MEMQUERY(memcache_exit());
}
/* os-specific atexit cleanup */
void
os_fast_exit(void)
{
/* nothing */
}
void
os_terminate_with_code(dcontext_t *dcontext, terminate_flags_t flags, int exit_code)
{
/* i#1319: we support requesting a signal via the exit code's 2nd byte */
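/* E.g., an exit_code of 0x0900 requests delivery of signal 9 (SIGKILL). */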
bool use_signal = exit_code > 0x00ff;
/* XXX: TERMINATE_THREAD not supported */
ASSERT_NOT_IMPLEMENTED(TEST(TERMINATE_PROCESS, flags));
if (use_signal) {
int sig = (exit_code & 0xff00) >> 8;
os_terminate_via_signal(dcontext, flags, sig);
ASSERT_NOT_REACHED();
}
if (TEST(TERMINATE_CLEANUP, flags)) {
/* we enter from several different places, so rewind until top-level kstat */
KSTOP_REWIND_UNTIL(thread_measured);
cleanup_and_terminate(dcontext, SYSNUM_EXIT_PROCESS, exit_code, 0,
true/*whole process*/, 0, 0);
} else {
/* clean up may be impossible - just terminate */
config_exit(); /* delete .1config file */
exit_process_syscall(exit_code);
}
}
void
os_terminate(dcontext_t *dcontext, terminate_flags_t flags)
{
os_terminate_with_code(dcontext, flags, -1);
}
int
os_timeout(int time_in_milliseconds)
{
ASSERT_NOT_IMPLEMENTED(false);
return 0;
}
/************************************************************************
* SEGMENT STEALING
*
* Not easy to make truly transparent -- but the alternative of dispatch
* by thread id on global memory has performance implications.
* Pull the non-STEAL_SEGMENT code out of the cvs attic for a base if
* transparency becomes more of a problem.
*/
#define TLS_LOCAL_STATE_OFFSET (offsetof(os_local_state_t, state))
/* offset from top of page */
#define TLS_OS_LOCAL_STATE 0x00
#define TLS_SELF_OFFSET (TLS_OS_LOCAL_STATE + offsetof(os_local_state_t, self))
#define TLS_THREAD_ID_OFFSET (TLS_OS_LOCAL_STATE + offsetof(os_local_state_t, tid))
#define TLS_DCONTEXT_OFFSET (TLS_OS_LOCAL_STATE + TLS_DCONTEXT_SLOT)
/* these should be used with os_tls_offset, so we do not need to add TLS_OS_LOCAL_STATE here */
#define TLS_APP_GS_BASE_OFFSET (offsetof(os_local_state_t, app_gs_base))
#define TLS_APP_FS_BASE_OFFSET (offsetof(os_local_state_t, app_fs_base))
#define TLS_APP_GS_OFFSET (offsetof(os_local_state_t, app_gs))
#define TLS_APP_FS_OFFSET (offsetof(os_local_state_t, app_fs))
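/* E.g., get_app_segment_base() below reads the cached app segment base via
* get_tls(os_get_app_seg_base_offset(seg)).
*/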
/* N.B.: imm and idx are ushorts!
* We use %c[0-9] to get gcc to emit an integer constant without a leading $ for
* the segment offset. See the documentation here:
* http://gcc.gnu.org/onlinedocs/gccint/Output-Template.html#Output-Template
* Also, var needs to match the pointer size, or else we'll get stack corruption.
* XXX: This is marked volatile to prevent gcc from speculating this code before
* checks for is_segment_register_initialized(), but if we could find a more
* precise constraint, then the compiler would be able to optimize better. See
* glibc comments on THREAD_SELF.
*/
#ifdef X86
# define WRITE_TLS_SLOT_IMM(imm, var) \
IF_NOT_HAVE_TLS(ASSERT_NOT_REACHED()); \
ASSERT(sizeof(var) == sizeof(void*)); \
asm volatile("mov %0, %"ASM_SEG":%c1" : : "r"(var), "i"(imm));
# define READ_TLS_SLOT_IMM(imm, var) \
IF_NOT_HAVE_TLS(ASSERT_NOT_REACHED()); \
ASSERT(sizeof(var) == sizeof(void*)); \
asm volatile("mov %"ASM_SEG":%c1, %0" : "=r"(var) : "i"(imm));
/* FIXME: need dedicated-storage var for _TLS_SLOT macros, can't use expr */
# define WRITE_TLS_SLOT(idx, var) \
IF_NOT_HAVE_TLS(ASSERT_NOT_REACHED()); \
ASSERT(sizeof(var) == sizeof(void*)); \
ASSERT(sizeof(idx) == 2); \
asm("mov %0, %%"ASM_XAX : : "m"((var)) : ASM_XAX); \
asm("movzw"IF_X64_ELSE("q","l")" %0, %%"ASM_XDX : : "m"((idx)) : ASM_XDX); \
asm("mov %%"ASM_XAX", %"ASM_SEG":(%%"ASM_XDX")" : : : ASM_XAX, ASM_XDX);
# define READ_TLS_SLOT(idx, var) \
ASSERT(sizeof(var) == sizeof(void*)); \
ASSERT(sizeof(idx) == 2); \
asm("movzw"IF_X64_ELSE("q","l")" %0, %%"ASM_XAX : : "m"((idx)) : ASM_XAX); \
asm("mov %"ASM_SEG":(%%"ASM_XAX"), %%"ASM_XAX : : : ASM_XAX); \
asm("mov %%"ASM_XAX", %0" : "=m"((var)) : : ASM_XAX);
#elif defined(ARM)
# define WRITE_TLS_SLOT_IMM(imm, var) \
__asm__ __volatile__( \
READ_TP_TO_R3 \
"str %0, ["ASM_R3", %1] \n\t" \
: : "r" (var), "i" (imm) \
: "memory", ASM_R3);
# define READ_TLS_SLOT_IMM(imm, var) \
__asm__ __volatile__( \
READ_TP_TO_R3 \
"ldr %0, ["ASM_R3", %1] \n\t" \
: "=r" (var) \
: "i" (imm) \
: ASM_R3);
# define WRITE_TLS_SLOT(idx, var) \
__asm__ __volatile__( \
READ_TP_TO_R3 \
"add "ASM_R3", "ASM_R3", %1 \n\t" \
"str %0, ["ASM_R3"] \n\t" \
: : "r" (var), "r" (idx * sizeof(var)) \
: "memory", ASM_R3);
# define READ_TLS_SLOT(idx, var) \
__asm__ __volatile__( \
READ_TP_TO_R3 \
"add "ASM_R3", "ASM_R3", %1 \n\t" \
"ldr %0, ["ASM_R3"] \n\t" \
: "=r" (var) \
: "r" (idx * sizeof(var)) \
: ASM_R3);
#endif /* X86/ARM */
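/* Usage example (as in get_os_tls() below):
*   os_local_state_t *os_tls;
*   READ_TLS_SLOT_IMM(TLS_SELF_OFFSET, os_tls);
* reads the self pointer back out of the stolen segment.
*/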
/* FIXME: assumes that fs/gs is not already in use by app */
static bool
is_segment_register_initialized(void)
{
if (read_selector(SEG_TLS) != 0)
return true;
#ifdef X64
if (tls_dr_using_msr()) {
/* When the MSR is used, the selector in the register remains 0.
* We can't clear the MSR early in a new thread and then look for
* a zero base here b/c if kernel decides to use GDT that zeroing
* will set the selector, unless we want to assume we know when
* the kernel uses the GDT.
* Instead we make a syscall to get the tid. This should be ok
* perf-wise b/c the common case is the non-zero above.
*/
byte *base = tls_get_fs_gs_segment_base(SEG_TLS);
ASSERT(tls_global_type == TLS_TYPE_ARCH_PRCTL);
if (base != (byte *) POINTER_MAX && base != NULL) {
os_local_state_t *os_tls = (os_local_state_t *) base;
return (os_tls->tid == get_sys_thread_id() ||
/* The child of a fork will initially come here */
os_tls->state.spill_space.dcontext->owning_process ==
get_parent_id());
}
}
#endif
return false;
}
/* converts a local_state_t offset to a segment offset */
ushort
os_tls_offset(ushort tls_offs)
{
/* no ushort truncation issues b/c TLS_LOCAL_STATE_OFFSET is 0 */
IF_NOT_HAVE_TLS(ASSERT_NOT_REACHED());
ASSERT(TLS_LOCAL_STATE_OFFSET == 0);
return (TLS_LOCAL_STATE_OFFSET + tls_offs);
}
/* XXX: Will return NULL if called before os_thread_init(), which sets
* ostd->dr_fs/gs_base.
*/
void *
os_get_dr_seg_base(dcontext_t *dcontext, reg_id_t seg)
{
os_thread_data_t *ostd;
IF_NOT_HAVE_TLS(ASSERT_NOT_REACHED());
/* FIXME i#1551: we need a better alias name for FS/GS on X86 and
* DR_REG_TPIDRURW/DR_REG_TPIDRURO on ARM.
*/
ASSERT(seg == SEG_TLS || seg == LIB_SEG_TLS);
if (dcontext == NULL)
dcontext = get_thread_private_dcontext();
if (dcontext == NULL)
return NULL;
ostd = (os_thread_data_t *)dcontext->os_field;
if (seg == IF_X86_ELSE(SEG_FS, DR_REG_TPIDRURW))
return ostd->dr_fs_base;
else
return ostd->dr_gs_base;
return NULL;
}
os_local_state_t *
get_os_tls(void)
{
os_local_state_t *os_tls;
ASSERT(is_segment_register_initialized());
READ_TLS_SLOT_IMM(TLS_SELF_OFFSET, os_tls);
return os_tls;
}
/* Obtain TLS from dcontext directly, which succeeds in pre-thread-init
* situations where get_os_tls() fails.
*/
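/* This is the containerof idiom: dcontext->local_state points at the state
* field embedded in os_local_state_t, so subtracting that field's offset
* recovers the enclosing struct.
*/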
static os_local_state_t *
get_os_tls_from_dc(dcontext_t *dcontext)
{
byte *local_state;
ASSERT(dcontext != NULL);
local_state = (byte*)dcontext->local_state;
if (local_state == NULL)
return NULL;
return (os_local_state_t *)(local_state - offsetof(os_local_state_t, state));
}
void *
os_get_app_seg_base(dcontext_t *dcontext, reg_id_t seg)
{
os_local_state_t *os_tls;
IF_NOT_HAVE_TLS(ASSERT_NOT_REACHED());
ASSERT(IF_X86_ELSE((seg == SEG_FS || seg == SEG_GS),
(seg == DR_REG_TPIDRURW || seg == DR_REG_TPIDRURO)));
if (dcontext == NULL)
dcontext = get_thread_private_dcontext();
if (dcontext == NULL) {
/* No dcontext means we haven't initialized TLS, so we haven't replaced
* the app's segments. get_segment_base is expensive, but this should
* be rare. Re-examine if it pops up in a profile.
*/
return get_segment_base(seg);
}
os_tls = get_os_tls_from_dc(dcontext);
if (seg == IF_X86_ELSE(SEG_FS, DR_REG_TPIDRURW))
return os_tls->app_fs_base;
else
return os_tls->app_gs_base;
return NULL;
}
ushort
os_get_app_seg_base_offset(reg_id_t seg)
{
#ifdef X86
IF_NOT_HAVE_TLS(ASSERT_NOT_REACHED());
ASSERT(TLS_LOCAL_STATE_OFFSET == 0);
if (seg == SEG_FS)
return TLS_APP_FS_BASE_OFFSET;
else if (seg == SEG_GS)
return TLS_APP_GS_BASE_OFFSET;
#endif
/* FIXME i#1551: NYI on ARM */
ASSERT_NOT_REACHED();
return 0;
}
ushort
os_get_app_seg_offset(reg_id_t seg)
{
#ifdef X86
IF_NOT_HAVE_TLS(ASSERT_NOT_REACHED());
ASSERT(TLS_LOCAL_STATE_OFFSET == 0);
if (seg == SEG_FS)
return TLS_APP_FS_OFFSET;
else if (seg == SEG_GS)
return TLS_APP_GS_OFFSET;
#endif
/* FIXME i#1551: NYI on ARM */
ASSERT_NOT_REACHED();
return 0;
}
void *
get_tls(ushort tls_offs)
{
void *val;
READ_TLS_SLOT(tls_offs, val);
return val;
}
void
set_tls(ushort tls_offs, void *value)
{
WRITE_TLS_SLOT(tls_offs, value);
}
/* Returns POINTER_MAX on failure.
* Assumes that cs, ss, ds, and es are flat.
* Should we export this to clients? For now they can get
* this information via opnd_compute_address().
*/
byte *
get_segment_base(uint seg)
{
#ifdef X86
if (seg == SEG_CS || seg == SEG_SS || seg == SEG_DS || seg == SEG_ES)
return NULL;
# ifdef HAVE_TLS
return tls_get_fs_gs_segment_base(seg);
# else
return (byte *) POINTER_MAX;
# endif /* HAVE_TLS */
#elif defined(ARM)
/* FIXME i#1551: NYI on ARM */
ASSERT_NOT_REACHED();
return (byte *) POINTER_MAX;
#endif
}
/* i#572: handle opnd_compute_address to return the application
* segment base value.
*/
byte *
get_app_segment_base(uint seg)
{
#ifdef X86
if (seg == SEG_CS || seg == SEG_SS || seg == SEG_DS || seg == SEG_ES)
return NULL;
#endif /* X86 */
if (IF_CLIENT_INTERFACE_ELSE(INTERNAL_OPTION(private_loader), false)) {
return get_tls(os_get_app_seg_base_offset(seg));
}
return get_segment_base(seg);
}
local_state_extended_t *
get_local_state_extended()
{
os_local_state_t *os_tls;
ASSERT(is_segment_register_initialized());
READ_TLS_SLOT_IMM(TLS_SELF_OFFSET, os_tls);
return &(os_tls->state);
}
local_state_t *
get_local_state()
{
#ifdef HAVE_TLS
return (local_state_t *) get_local_state_extended();
#else
return NULL;
#endif
}
/* i#107: handle segment register usage conflicts between app and dr:
* os_handle_mov_seg updates the app's tls selector maintained by DR.
* It is called before entering code cache in dispatch_enter_fcache.
*/
void
os_handle_mov_seg(dcontext_t *dcontext, byte *pc)
{
#ifdef X86
instr_t instr;
opnd_t opnd;
reg_id_t seg;
ushort sel = 0;
our_modify_ldt_t *desc;
int desc_idx;
os_local_state_t *os_tls;
os_thread_data_t *ostd;
instr_init(dcontext, &instr);
decode_cti(dcontext, pc, &instr);
/* the first instr must be mov seg */
ASSERT(instr_get_opcode(&instr) == OP_mov_seg);
opnd = instr_get_dst(&instr, 0);
ASSERT(opnd_is_reg(opnd));
seg = opnd_get_reg(opnd);
ASSERT(reg_is_segment(seg));
ostd = (os_thread_data_t *)dcontext->os_field;
desc = (our_modify_ldt_t *)ostd->app_thread_areas;
os_tls = get_os_tls();
/* get the selector value */
opnd = instr_get_src(&instr, 0);
if (opnd_is_reg(opnd)) {
sel = (ushort)reg_get_value_priv(opnd_get_reg(opnd),
get_mcontext(dcontext));
} else {
void *ptr;
ptr = (ushort *)opnd_compute_address_priv(opnd, get_mcontext(dcontext));
ASSERT(ptr != NULL);
if (!safe_read(ptr, sizeof(sel), &sel)) {
/* FIXME: if invalid address, should deliver a signal to user. */
ASSERT_NOT_IMPLEMENTED(false);
}
}
/* calculate the entry_number */
desc_idx = SELECTOR_INDEX(sel) - tls_min_index();
if (seg == SEG_GS) {
os_tls->app_gs = sel;
os_tls->app_gs_base = (void *)(ptr_uint_t) desc[desc_idx].base_addr;
} else {
os_tls->app_fs = sel;
os_tls->app_fs_base = (void *)(ptr_uint_t) desc[desc_idx].base_addr;
}
instr_free(dcontext, &instr);
LOG(THREAD_GET, LOG_THREADS, 2,
"thread "TIDFMT" segment change %s to selector 0x%x => app fs: "PFX", gs: "PFX"\n",
get_thread_id(), reg_names[seg], sel, os_tls->app_fs_base, os_tls->app_gs_base);
#elif defined(ARM)
/* FIXME i#1551: NYI on ARM */
ASSERT_NOT_REACHED();
#endif /* X86/ARM */
}
/* initialization for mangle_app_seg; must be called before
* DR sets up its own segment.
*/
static void
os_tls_app_seg_init(os_local_state_t *os_tls, void *segment)
{
#ifdef X86
int i, index;
our_modify_ldt_t *desc;
app_pc app_fs_base, app_gs_base;
os_tls->app_fs = read_selector(SEG_FS);
os_tls->app_gs = read_selector(SEG_GS);
app_fs_base = get_segment_base(SEG_FS);
app_gs_base = get_segment_base(SEG_GS);
/* If we're a non-initial thread, fs/gs will be set to the parent's value */
if (!is_dynamo_address(app_gs_base))
os_tls->app_gs_base = app_gs_base;
else
os_tls->app_gs_base = NULL;
if (!is_dynamo_address(app_fs_base))
os_tls->app_fs_base = app_fs_base;
else
os_tls->app_fs_base = NULL;
/* get all TLS thread area values */
/* XXX: is get_thread_area supported in 64-bit kernel?
* It has syscall number 211.
* It works for a 32-bit application running in a 64-bit kernel.
* It returns error value -38 for a 64-bit app in a 64-bit kernel.
*/
desc = &os_tls->os_seg_info.app_thread_areas[0];
tls_initialize_indices(os_tls);
index = tls_min_index();
for (i = 0; i < GDT_NUM_TLS_SLOTS; i++) {
tls_get_descriptor(i + index, &desc[i]);
}
os_tls->os_seg_info.dr_fs_base = IF_X64_ELSE(NULL, segment);
os_tls->os_seg_info.dr_gs_base = IF_X64_ELSE(segment, NULL);
/* now allocate the tls segment for client libraries */
if (IF_CLIENT_INTERFACE_ELSE(INTERNAL_OPTION(private_loader), false)) {
# ifdef X64
os_tls->os_seg_info.dr_fs_base = privload_tls_init(os_tls->app_fs_base);
# else
os_tls->os_seg_info.dr_gs_base = privload_tls_init(os_tls->app_gs_base);
# endif
}
LOG(THREAD_GET, LOG_THREADS, 1, "thread "TIDFMT" app fs: "PFX", gs: "PFX"\n",
get_thread_id(), os_tls->app_fs_base, os_tls->app_gs_base);
LOG(THREAD_GET, LOG_THREADS, 1, "thread "TIDFMT" DR fs: "PFX", gs: "PFX"\n",
get_thread_id(), os_tls->os_seg_info.dr_fs_base, os_tls->os_seg_info.dr_gs_base);
#elif defined(ARM)
/* FIXME i#1551: NYI on ARM */
ASSERT_NOT_IMPLEMENTED(false);
#endif /* X86/ARM */
}
void
os_tls_init(void)
{
#ifdef HAVE_TLS
/* We create a 1-page segment with an LDT entry for each thread and load its
* selector into fs/gs.
* FIXME PR 205276: this whole scheme currently does not check whether the app
* is using segments; we need to watch the modify_ldt syscall.
*/
/* FIXME: heap_mmap marks as exec, we just want RW */
byte *segment = heap_mmap(PAGE_SIZE);
os_local_state_t *os_tls = (os_local_state_t *) segment;
LOG(GLOBAL, LOG_THREADS, 1, "os_tls_init for thread "TIDFMT"\n", get_thread_id());
/* MUST zero out dcontext slot so uninit access gets NULL */
memset(segment, 0, PAGE_SIZE);
/* store key data in the tls itself */
os_tls->self = os_tls;
os_tls->tid = get_thread_id();
os_tls->tls_type = TLS_TYPE_NONE;
/* We save DR's TLS segment base here so that os_get_dr_seg_base() will work
* even when -no_mangle_app_seg is set. If -mangle_app_seg is set, this
* will be overwritten in os_tls_app_seg_init().
*/
os_tls->os_seg_info.IF_X64_ELSE(dr_gs_base, dr_fs_base) = segment;
ASSERT(proc_is_cache_aligned(os_tls->self + TLS_LOCAL_STATE_OFFSET));
/* Verify that local_state_extended_t should indeed be used. */
ASSERT(DYNAMO_OPTION(ibl_table_in_tls));
/* get application's GS/FS segment base before being replaced by DR. */
if (INTERNAL_OPTION(mangle_app_seg))
os_tls_app_seg_init(os_tls, segment);
tls_thread_init(os_tls, segment);
ASSERT(os_tls->tls_type != TLS_TYPE_NONE);
# ifdef X86
/* store type in global var for convenience: should be same for all threads */
tls_global_type = os_tls->tls_type;
# elif defined(ARM)
/* FIXME i#1551: NYI on ARM */
ASSERT_NOT_IMPLEMENTED(false);
# endif
/* FIXME: this should be a SYSLOG fatal error? Should fall back on !HAVE_TLS?
* Should have create_ldt_entry() return failure instead of asserting, then.
*/
#else
tls_table = (tls_slot_t *)
global_heap_alloc(MAX_THREADS*sizeof(tls_slot_t) HEAPACCT(ACCT_OTHER));
memset(tls_table, 0, MAX_THREADS*sizeof(tls_slot_t));
#endif
ASSERT(is_segment_register_initialized());
}
/* Frees local_state. If the calling thread is exiting (i.e.,
* !other_thread) then also frees kernel resources for the calling
* thread; if other_thread then that may not be possible.
*/
void
os_tls_exit(local_state_t *local_state, bool other_thread)
{
#ifdef HAVE_TLS
# ifdef X86
static const ptr_uint_t zero = 0;
# endif /* X86 */
/* We can't read from fs: as we can be called from other threads */
/* ASSUMPTION: local_state_t is laid out at same start as local_state_extended_t */
os_local_state_t *os_tls = (os_local_state_t *)
(((byte*)local_state) - offsetof(os_local_state_t, state));
tls_type_t tls_type = os_tls->tls_type;
int index = os_tls->ldt_index;
# ifdef X86
/* If the MSR is in use, writing to the reg faults. We rely on it being 0
* to indicate that.
*/
if (!other_thread && read_selector(SEG_TLS) != 0) {
WRITE_DR_SEG(zero); /* macro needs lvalue! */
}
# endif /* X86 */
heap_munmap(os_tls->self, PAGE_SIZE);
/* For another thread we can't really make these syscalls so we have to
* leave it un-cleaned-up. That's fine if the other thread is exiting:
* but if we have a detach feature (i#95) we'll have to get the other
* thread to run this code.
*/
if (!other_thread) {
tls_thread_free(tls_type, index);
# ifdef X64
if (tls_type == TLS_TYPE_ARCH_PRCTL) {
/* syscall re-sets gs register so re-clear it */
if (read_selector(SEG_TLS) != 0) {
WRITE_DR_SEG(zero); /* macro needs lvalue! */
}
}
# endif
}
#else
global_heap_free(tls_table, MAX_THREADS*sizeof(tls_slot_t) HEAPACCT(ACCT_OTHER));
DELETE_LOCK(tls_lock);
#endif
}
static int
os_tls_get_gdt_index(dcontext_t *dcontext)
{
os_local_state_t *os_tls = (os_local_state_t *)
(((byte*)dcontext->local_state) - offsetof(os_local_state_t, state));
if (os_tls->tls_type == TLS_TYPE_GDT)
return os_tls->ldt_index;
else
return -1;
}
void
os_tls_pre_init(int gdt_index)
{
#ifdef X86
/* Only set to above 0 for tls_type == TLS_TYPE_GDT */
if (gdt_index > 0) {
/* PR 458917: clear gdt slot to avoid leak across exec */
DEBUG_DECLARE(bool ok;)
static const ptr_uint_t zero = 0;
/* Be sure to clear the selector before anything that might
* call get_thread_private_dcontext()
*/
WRITE_DR_SEG(zero); /* macro needs lvalue! */
DEBUG_DECLARE(ok = )
tls_clear_descriptor(gdt_index);
ASSERT(ok);
}
#elif defined(ARM)
/* FIXME i#1551: NYI on ARM */
ASSERT_NOT_IMPLEMENTED(false);
#endif /* X86/ARM */
}
#ifdef CLIENT_INTERFACE
/* Allocates num_slots TLS slots aligned to the given alignment */
bool
os_tls_calloc(OUT uint *offset, uint num_slots, uint alignment)
{
bool res = false;
uint i, count = 0;
int start = -1;
uint offs = offsetof(os_local_state_t, client_tls);
if (num_slots > MAX_NUM_CLIENT_TLS)
return false;
mutex_lock(&client_tls_lock);
for (i = 0; i < MAX_NUM_CLIENT_TLS; i++) {
if (!client_tls_allocated[i] &&
/* ALIGNED doesn't work for 0 */
(alignment == 0 || ALIGNED(offs + i*sizeof(void*), alignment))) {
if (start == -1)
start = i;
count++;
if (count >= num_slots)
break;
} else {
start = -1;
count = 0;
}
}
if (count >= num_slots) {
for (i = 0; i < num_slots; i++)
client_tls_allocated[i + start] = true;
*offset = offs + start*sizeof(void*);
res = true;
}
mutex_unlock(&client_tls_lock);
return res;
}
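/* Hypothetical usage sketch (mirrors what dr_raw_tls_calloc() needs):
*   uint offs;
*   if (os_tls_calloc(&offs, 1, 0))
*       ... the slot can then be read via get_tls(offs) ...
*/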
bool
os_tls_cfree(uint offset, uint num_slots)
{
uint i;
uint offs = (offset - offsetof(os_local_state_t, client_tls))/sizeof(void*);
bool ok = true;
mutex_lock(&client_tls_lock);
for (i = 0; i < num_slots; i++) {
if (!client_tls_allocated[i + offs])
ok = false;
client_tls_allocated[i + offs] = false;
}
mutex_unlock(&client_tls_lock);
return ok;
}
#endif
void
os_thread_init(dcontext_t *dcontext)
{
os_local_state_t *os_tls = get_os_tls();
os_thread_data_t *ostd = (os_thread_data_t *)
heap_alloc(dcontext, sizeof(os_thread_data_t) HEAPACCT(ACCT_OTHER));
dcontext->os_field = (void *) ostd;
/* make sure stack fields, etc. are 0 now so they can be initialized on demand
* (don't have app esp register handy here to init now)
*/
memset(ostd, 0, sizeof(*ostd));
ksynch_init_var(&ostd->suspended);
ksynch_init_var(&ostd->wakeup);
ksynch_init_var(&ostd->resumed);
ksynch_init_var(&ostd->terminated);
#ifdef RETURN_AFTER_CALL
if (!dynamo_initialized) {
/* Find the bottom of the stack of the initial (native) entry */
ostd->stack_bottom_pc = find_stack_bottom();
LOG(THREAD, LOG_ALL, 1, "Stack bottom pc = "PFX"\n", ostd->stack_bottom_pc);
} else {
/* We only need the stack bottom for the initial thread */
ostd->stack_bottom_pc = NULL;
}
#endif
ASSIGN_INIT_LOCK_FREE(ostd->suspend_lock, suspend_lock);
signal_thread_init(dcontext);
/* i#107: initialize thread area information;
* the values were first retrieved in os_tls_init and stored in os_tls.
*/
ostd->dr_gs_base = os_tls->os_seg_info.dr_gs_base;
ostd->dr_fs_base = os_tls->os_seg_info.dr_fs_base;
if (INTERNAL_OPTION(mangle_app_seg)) {
ostd->app_thread_areas =
heap_alloc(dcontext, sizeof(our_modify_ldt_t) * GDT_NUM_TLS_SLOTS
HEAPACCT(ACCT_OTHER));
memcpy(ostd->app_thread_areas,
os_tls->os_seg_info.app_thread_areas,
sizeof(our_modify_ldt_t) * GDT_NUM_TLS_SLOTS);
}
/* FIXME i#1551: we need a better alias for gs/fs on ARM */
LOG(THREAD, LOG_THREADS, 1, "cur gs base is "PFX"\n",
get_segment_base(IF_X86_ELSE(SEG_GS, DR_REG_TPIDRURO)));
LOG(THREAD, LOG_THREADS, 1, "cur fs base is "PFX"\n",
get_segment_base(IF_X86_ELSE(SEG_FS, DR_REG_TPIDRURW)));
#ifdef MACOS
/* XXX: do we need to free/close dcontext->thread_port? I don't think so. */
dcontext->thread_port = dynamorio_mach_syscall(MACH_thread_self_trap, 0);
LOG(THREAD, LOG_ALL, 1, "Mach thread port: %d\n", dcontext->thread_port);
#endif
}
void
os_thread_exit(dcontext_t *dcontext, bool other_thread)
{
os_thread_data_t *ostd = (os_thread_data_t *) dcontext->os_field;
/* i#237/PR 498284: if we had a vfork child call execve we need to clean up
* the env vars.
*/
if (dcontext->thread_record->execve)
handle_execve_post(dcontext);
DELETE_LOCK(ostd->suspend_lock);
signal_thread_exit(dcontext, other_thread);
ksynch_free_var(&ostd->suspended);
ksynch_free_var(&ostd->wakeup);
ksynch_free_var(&ostd->resumed);
ksynch_free_var(&ostd->terminated);
/* for non-debug we do fast exit path and don't free local heap */
DODEBUG({
if (INTERNAL_OPTION(mangle_app_seg)) {
heap_free(dcontext, ostd->app_thread_areas,
sizeof(our_modify_ldt_t) * GDT_NUM_TLS_SLOTS
HEAPACCT(ACCT_OTHER));
#ifdef CLIENT_INTERFACE
if (INTERNAL_OPTION(private_loader)) {
privload_tls_exit(IF_X64_ELSE(ostd->dr_fs_base,
ostd->dr_gs_base));
}
#endif
}
heap_free(dcontext, ostd, sizeof(os_thread_data_t) HEAPACCT(ACCT_OTHER));
});
}
/* Happens in the parent prior to fork. */
static void
os_fork_pre(dcontext_t *dcontext)
{
os_thread_data_t *ostd = (os_thread_data_t *) dcontext->os_field;
/* Otherwise a thread might wait for us. */
ASSERT_OWN_NO_LOCKS();
ASSERT(ostd->fork_threads == NULL && ostd->fork_num_threads == 0);
/* i#239: Synch with all other threads to ensure that they are holding no
* locks across the fork.
* FIXME i#26: Suspend signals received before initializing siginfo are
* squelched, so we won't be able to suspend threads that are initializing.
*/
LOG(GLOBAL, LOG_SYSCALLS|LOG_THREADS, 2,
"fork: synching with other threads to prevent deadlock in child\n");
if (!synch_with_all_threads(THREAD_SYNCH_SUSPENDED_VALID_MCONTEXT_OR_NO_XFER,
&ostd->fork_threads,
&ostd->fork_num_threads,
THREAD_SYNCH_VALID_MCONTEXT,
/* If we fail to suspend a thread, there is a
* risk of deadlock in the child, so it's worth
* retrying on failure.
*/
THREAD_SYNCH_SUSPEND_FAILURE_RETRY)) {
/* If we failed to synch with all threads, we live with the possibility
* of deadlock and continue as normal.
*/
LOG(GLOBAL, LOG_SYSCALLS|LOG_THREADS, 1,
"fork: synch failed, possible deadlock in child\n");
ASSERT_CURIOSITY(false);
}
/* We go back to the code cache to execute the syscall, so we can't hold
* locks. If the synch succeeded, no one else is running, so it should be
* safe to release these locks. However, if there are any rogue threads,
* then releasing these locks will allow them to synch and create threads.
* Such threads could be running due to synch failure or presence of
* non-suspendable client threads. We keep our data in ostd to prevent some
* conflicts, but there are some unhandled corner cases.
*/
mutex_unlock(&thread_initexit_lock);
mutex_unlock(&all_threads_synch_lock);
}
/* Happens after the fork in both the parent and child. */
static void
os_fork_post(dcontext_t *dcontext, bool parent)
{
os_thread_data_t *ostd = (os_thread_data_t *) dcontext->os_field;
/* Re-acquire the locks we released before the fork. */
mutex_lock(&all_threads_synch_lock);
mutex_lock(&thread_initexit_lock);
/* Resume the other threads that we suspended. */
if (parent) {
LOG(GLOBAL, LOG_SYSCALLS|LOG_THREADS, 2,
"fork: resuming other threads after fork\n");
}
end_synch_with_all_threads(ostd->fork_threads, ostd->fork_num_threads,
parent/*resume in parent, not in child*/);
ostd->fork_threads = NULL; /* Freed by end_synch_with_all_threads. */
ostd->fork_num_threads = 0;
}
/* this one is called before child's new logfiles are set up */
void
os_fork_init(dcontext_t *dcontext)
{
int iter;
/* We use a larger data size than file_t to avoid clobbering our stack (i#991) */
ptr_uint_t fd;
ptr_uint_t flags;
/* Static assert would save debug build overhead: could use array bound trick */
ASSERT(sizeof(file_t) <= sizeof(ptr_uint_t));
/* i#239: If there were unsuspended threads across the fork, we could have
* forked while another thread held locks. We reset the locks and try to
* cope with any intermediate state left behind from the parent. If we
* encounter more deadlocks after fork, we can add more lock and data resets
* on a case by case basis.
*/
mutex_fork_reset(&all_threads_synch_lock);
mutex_fork_reset(&thread_initexit_lock);
os_fork_post(dcontext, false/*!parent*/);
/* re-populate cached data that contains pid */
pid_cached = get_process_id();
get_application_pid_helper(true);
get_application_name_helper(true, true /* not important */);
/* close all copies of parent files */
TABLE_RWLOCK(fd_table, write, lock);
iter = 0;
do {
iter = generic_hash_iterate_next(GLOBAL_DCONTEXT, fd_table, iter,
&fd, (void **)&flags);
if (iter < 0)
break;
if (TEST(OS_OPEN_CLOSE_ON_FORK, flags)) {
close_syscall((file_t)fd);
iter = generic_hash_iterate_remove(GLOBAL_DCONTEXT, fd_table,
iter, fd);
}
} while (true);
TABLE_RWLOCK(fd_table, write, unlock);
}
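/* Illustrative sketch (not part of the build) of how the three fork hooks
 * above pair up around the fork syscall; the driver below is a hypothetical
 * stand-in for the actual syscall pre/post dispatch code:
 *
 *   os_fork_pre(dcontext);             // parent: suspend others, drop locks
 *   pid_t child = fork_syscall();      // the actual SYS_fork/SYS_clone
 *   if (child != 0)
 *       os_fork_post(dcontext, true);  // parent: re-lock, resume others
 *   else
 *       os_fork_init(dcontext);        // child: reset locks, close parent fds
 */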
/* We only bother swapping the library segment if we're using the private
* loader.
*/
bool
os_should_swap_state(void)
{
/* -private_loader currently implies -mangle_app_seg, but let's be safe. */
return (INTERNAL_OPTION(mangle_app_seg) &&
IF_CLIENT_INTERFACE_ELSE(INTERNAL_OPTION(private_loader), false));
}
bool
os_using_app_state(dcontext_t *dcontext)
{
/* FIXME: This could be optimized to avoid the syscall by keeping state in
* the dcontext.
*/
if (INTERNAL_OPTION(mangle_app_seg)) {
return (get_segment_base(LIB_SEG_TLS) ==
os_get_app_seg_base(dcontext, LIB_SEG_TLS));
}
/* We're always in the app state if we're not mangling. */
return true;
}
/* Similar to PEB swapping on Windows, this call will switch between DR's
* private lib segment base and the app's segment base.
* i#107: If the app wants to use SEG_TLS, we should also switch that back at
* this boundary, but there are many places where we simply assume it is always
* installed.
*/
void
os_swap_context(dcontext_t *dcontext, bool to_app, dr_state_flags_t flags)
{
if (os_should_swap_state())
os_switch_seg_to_context(dcontext, LIB_SEG_TLS, to_app);
}
void
os_thread_under_dynamo(dcontext_t *dcontext)
{
os_swap_context(dcontext, false/*to dr*/, DR_STATE_ALL);
start_itimer(dcontext);
}
void
os_thread_not_under_dynamo(dcontext_t *dcontext)
{
stop_itimer(dcontext);
os_swap_context(dcontext, true/*to app*/, DR_STATE_ALL);
}
static pid_t
get_process_group_id(void)
{
return dynamorio_syscall(SYS_getpgid, 0);
}
#endif /* !NOT_DYNAMORIO_CORE_PROPER: around most of file, to exclude preload */
process_id_t
get_process_id(void)
{
return dynamorio_syscall(SYS_getpid, 0);
}
#ifndef NOT_DYNAMORIO_CORE_PROPER /* around most of file, to exclude preload */
process_id_t
get_parent_id(void)
{
return dynamorio_syscall(SYS_getppid, 0);
}
thread_id_t
get_sys_thread_id(void)
{
#ifdef MACOS
if (kernel_thread_groups)
return dynamorio_syscall(SYS_thread_selfid, 0);
#else
if (kernel_thread_groups)
return dynamorio_syscall(SYS_gettid, 0);
#endif
return dynamorio_syscall(SYS_getpid, 0);
}
thread_id_t
get_thread_id(void)
{
/* i#228/PR 494330: making a syscall here is a perf bottleneck since we call
* this routine in read and recursive locks so use the TLS value instead
*/
thread_id_t id = get_tls_thread_id();
if (id != INVALID_THREAD_ID)
return id;
else
return get_sys_thread_id();
}
thread_id_t
get_tls_thread_id(void)
{
ptr_int_t tid; /* can't use thread_id_t since it's 32-bits */
if (!is_segment_register_initialized())
return INVALID_THREAD_ID;
READ_TLS_SLOT_IMM(TLS_THREAD_ID_OFFSET, tid);
/* The read is 8 bytes wide, so it picks up the adjacent app_gs and app_fs
 * TLS fields as well:
 * 0x000000007127357b <get_tls_thread_id+37>: mov %gs:(%rax),%rax
 * 0x000000007127357f <get_tls_thread_id+41>: mov %rax,-0x8(%rbp)
 * Thus we omit the TRUNCATE check and truncate on return.
 */
return (thread_id_t) tid;
}
/* returns the thread-private dcontext pointer for the calling thread */
dcontext_t*
get_thread_private_dcontext(void)
{
#ifdef HAVE_TLS
dcontext_t *dcontext;
/* We have to check this b/c this is called from __errno_location prior
* to os_tls_init, as well as after os_tls_exit, and early in a new
* thread's initialization (see comments below on that).
*/
if (!is_segment_register_initialized())
return (IF_CLIENT_INTERFACE(standalone_library ? GLOBAL_DCONTEXT :) NULL);
/* We used to check tid and return NULL to distinguish parent from child, but
* that was affecting performance (xref PR 207366: but I'm leaving the assert in
* for now so debug build will still incur it). So we fixed the cases that
* needed that:
*
* - dynamo_thread_init() calling is_thread_initialized() for a new thread
* created via clone or the start/stop interface: so we have
* is_thread_initialized() pay the get_thread_id() cost.
* - new_thread_setup()'s ENTER_DR_HOOK kstats, or a crash and the signal
* handler asking about dcontext: we have new_thread_dynamo_start()
* clear the segment register for us early on.
* - child of fork (ASSERT_OWN_NO_LOCKS, etc. on re-entering DR):
* here we just suppress the assert: we'll use this same dcontext.
* xref PR 209518 where w/o this fix we used to need an extra KSTOP.
*
* An alternative would be to have the parent thread clear the segment
* register, or even set up the child's TLS ahead of time ourselves
* (and special-case so that we know if at clone syscall the app state is not
* quite correct: but we're already stealing a register there: PR 286194).
* We could also have the kernel set up TLS for us (PR 285898).
*
* For hotp_only or non-full-control (native_exec, e.g.) (PR 212012), this
* routine is not the only issue: we have to catch all new threads since
* hotp_only gateways assume tls is set up.
* Xref PR 192231.
*/
/* PR 307698: this assert causes large slowdowns (also xref PR 207366) */
DOCHECK(CHKLVL_DEFAULT+1, {
ASSERT(get_tls_thread_id() == get_sys_thread_id() ||
/* ok for fork as mentioned above */
pid_cached != get_process_id());
});
READ_TLS_SLOT_IMM(TLS_DCONTEXT_OFFSET, dcontext);
return dcontext;
#else
/* Assumption: no lock is needed on a read => no race between reading and
 * writing the same tid. Since both get and set operate only on the
 * current thread, they cannot execute simultaneously for the same tid.
 */
thread_id_t tid = get_thread_id();
int i;
if (tls_table != NULL) {
for (i=0; i<MAX_THREADS; i++) {
if (tls_table[i].tid == tid) {
return tls_table[i].dcontext;
}
}
}
return NULL;
#endif
}
/* sets the thread-private dcontext pointer for the calling thread */
void
set_thread_private_dcontext(dcontext_t *dcontext)
{
#ifdef HAVE_TLS
ASSERT(is_segment_register_initialized());
WRITE_TLS_SLOT_IMM(TLS_DCONTEXT_OFFSET, dcontext);
#else
thread_id_t tid = get_thread_id();
int i;
bool found = false;
ASSERT(tls_table != NULL);
mutex_lock(&tls_lock);
for (i=0; i<MAX_THREADS; i++) {
if (tls_table[i].tid == tid) {
if (dcontext == NULL) {
/* if setting to NULL, clear the entire slot for reuse */
tls_table[i].tid = 0;
}
tls_table[i].dcontext = dcontext;
found = true;
break;
}
}
if (!found) {
if (dcontext == NULL) {
/* don't do anything...but why would this happen? */
} else {
/* look for an empty slot */
for (i=0; i<MAX_THREADS; i++) {
if (tls_table[i].tid == 0) {
tls_table[i].tid = tid;
tls_table[i].dcontext = dcontext;
found = true;
break;
}
}
}
}
mutex_unlock(&tls_lock);
ASSERT(found);
#endif
}
/* replaces old with new
* use for forking: child should replace parent's id with its own
*/
static void
replace_thread_id(thread_id_t old, thread_id_t new)
{
#ifdef HAVE_TLS
ptr_int_t new_tid = new; /* can't use thread_id_t since it's 32-bits */
ASSERT(is_segment_register_initialized());
DOCHECK(1, {
ptr_int_t old_tid; /* can't use thread_id_t since it's 32-bits */
READ_TLS_SLOT_IMM(TLS_THREAD_ID_OFFSET, old_tid);
IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_uint(old_tid)));
ASSERT(old_tid == old);
});
WRITE_TLS_SLOT_IMM(TLS_THREAD_ID_OFFSET, new_tid);
#else
int i;
mutex_lock(&tls_lock);
for (i=0; i<MAX_THREADS; i++) {
if (tls_table[i].tid == old) {
tls_table[i].tid = new;
break;
}
}
mutex_unlock(&tls_lock);
#endif
}
#endif /* !NOT_DYNAMORIO_CORE_PROPER */
/* translate permission string to platform independent protection bits */
uint
permstr_to_memprot(const char * const perm)
{
uint mem_prot = 0;
if (perm == NULL || *perm == '\0')
return mem_prot;
if (perm[2]=='x')
mem_prot |= MEMPROT_EXEC;
if (perm[1]=='w')
mem_prot |= MEMPROT_WRITE;
if (perm[0]=='r')
mem_prot |= MEMPROT_READ;
return mem_prot;
}
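/* Usage sketch (illustrative only): translating the permission column of a
 * /proc/self/maps line such as "rw-p". Note the caller must supply at least
 * three characters, since perm[0..2] are indexed unchecked:
 *
 *   uint prot = permstr_to_memprot("rw-p");
 *   ASSERT(TESTALL(MEMPROT_READ|MEMPROT_WRITE, prot) &&
 *          !TEST(MEMPROT_EXEC, prot));
 */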
/* translate platform independent protection bits to native flags */
uint
memprot_to_osprot(uint prot)
{
uint mmap_prot = 0;
if (TEST(MEMPROT_EXEC, prot))
mmap_prot |= PROT_EXEC;
if (TEST(MEMPROT_READ, prot))
mmap_prot |= PROT_READ;
if (TEST(MEMPROT_WRITE, prot))
mmap_prot |= PROT_WRITE;
return mmap_prot;
}
#ifndef NOT_DYNAMORIO_CORE_PROPER
/* translate native flags to platform independent protection bits */
static inline uint
osprot_to_memprot(uint prot)
{
uint mem_prot = 0;
if (TEST(PROT_EXEC, prot))
mem_prot |= MEMPROT_EXEC;
if (TEST(PROT_READ, prot))
mem_prot |= MEMPROT_READ;
if (TEST(PROT_WRITE, prot))
mem_prot |= MEMPROT_WRITE;
return mem_prot;
}
#endif
/* returns osprot flags preserving all native protection flags except
* for RWX, which are replaced according to memprot */
uint
osprot_replace_memprot(uint old_osprot, uint memprot)
{
/* Note only protection flags PROT_ are relevant to mprotect()
* and they are separate from any other MAP_ flags passed to mmap()
*/
uint new_osprot = memprot_to_osprot(memprot);
return new_osprot;
}
/* libc independence */
static inline long
mprotect_syscall(byte *p, size_t size, uint prot)
{
return dynamorio_syscall(SYS_mprotect, 3, p, size, prot);
}
bool
mmap_syscall_succeeded(byte *retval)
{
ptr_int_t result = (ptr_int_t) retval;
/* libc interprets up to -PAGE_SIZE as an error, and you never know if
* some weird errno will be used by say vmkernel (xref PR 365331)
*/
bool fail = (result < 0 && result >= -PAGE_SIZE);
ASSERT_CURIOSITY(!fail ||
IF_VMX86(result == -ENOENT ||)
IF_VMX86(result == -ENOSPC ||)
result == -EBADF ||
result == -EACCES ||
result == -EINVAL ||
result == -ETXTBSY ||
result == -EAGAIN ||
result == -ENOMEM ||
result == -ENODEV ||
result == -EFAULT);
return !fail;
}
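/* Caller pattern (sketch): on failure the raw mmap return value is the
 * negated errno, so a caller can recover the specific error like this:
 *
 *   byte *p = mmap_syscall(NULL, size, prot, flags, -1, 0);
 *   if (!mmap_syscall_succeeded(p)) {
 *       int err = -(int)(ptr_int_t)p;   // e.g., ENOMEM
 *       ...
 *   }
 */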
/* N.B.: offs should be in pages for 32-bit Linux */
static inline byte *
mmap_syscall(byte *addr, size_t len, ulong prot, ulong flags, ulong fd, ulong offs)
{
#if defined(MACOS) && !defined(X64)
return (byte *)(ptr_int_t)
dynamorio_syscall(SYS_mmap, 7, addr, len, prot, flags, fd,
/* represent 64-bit arg as 2 32-bit args */
offs, 0);
#else
return (byte *)(ptr_int_t)
dynamorio_syscall(IF_MACOS_ELSE(SYS_mmap, IF_X64_ELSE(SYS_mmap, SYS_mmap2)), 6,
addr, len, prot, flags, fd, offs);
#endif
}
static inline long
munmap_syscall(byte *addr, size_t len)
{
return dynamorio_syscall(SYS_munmap, 2, addr, len);
}
#ifndef NOT_DYNAMORIO_CORE_PROPER
/* free memory allocated from os_raw_mem_alloc */
bool
os_raw_mem_free(void *p, size_t size, uint flags, heap_error_code_t *error_code)
{
long rc;
ASSERT(error_code != NULL);
ASSERT(size > 0 && ALIGNED(size, PAGE_SIZE));
rc = munmap_syscall(p, size);
if (rc != 0) {
*error_code = -rc;
} else {
*error_code = HEAP_ERROR_SUCCESS;
}
return (rc == 0);
}
/* Try to allocate memory at the preferred address directly from the os; the
 * caller is required to handle thread synchronization and to update
 * dynamo vm areas.
 */
void *
os_raw_mem_alloc(void *preferred, size_t size, uint prot, uint flags,
heap_error_code_t *error_code)
{
byte *p;
uint os_prot = memprot_to_osprot(prot);
uint os_flags = MAP_PRIVATE |
MAP_ANONYMOUS |
(TEST(RAW_ALLOC_32BIT, flags) ? MAP_32BIT : 0);
ASSERT(error_code != NULL);
/* should only be used on aligned pieces */
ASSERT(size > 0 && ALIGNED(size, PAGE_SIZE));
p = mmap_syscall(preferred, size, os_prot, os_flags, -1, 0);
if (!mmap_syscall_succeeded(p)) {
*error_code = -(heap_error_code_t)(ptr_int_t)p;
LOG(GLOBAL, LOG_HEAP, 3,
"os_raw_mem_alloc "SZFMT" bytes failed "PFX"\n", size, p);
return NULL;
}
if (preferred != NULL && p != preferred) {
*error_code = HEAP_ERROR_NOT_AT_PREFERRED;
os_raw_mem_free(p, size, flags, error_code);
LOG(GLOBAL, LOG_HEAP, 3,
"os_raw_mem_alloc "SZFMT" bytes failed "PFX"\n", size, p);
return NULL;
}
LOG(GLOBAL, LOG_HEAP, 2, "os_raw_mem_alloc: "SZFMT" bytes @ "PFX"\n",
size, p);
return p;
}
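/* Usage sketch (illustrative): one page of rw memory straight from the OS,
 * bypassing DR's heap; error_code receives the negated errno on failure:
 *
 *   heap_error_code_t err;
 *   void *p = os_raw_mem_alloc(NULL, PAGE_SIZE, MEMPROT_READ|MEMPROT_WRITE,
 *                              0, &err);
 *   if (p != NULL)
 *       os_raw_mem_free(p, PAGE_SIZE, 0, &err);
 */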
#if defined(CLIENT_INTERFACE) && defined(LINUX)
DR_API
/* XXX: we could add dr_raw_mem_realloc() instead of dr_raw_mremap() -- though
 * there is no realloc on Windows: one is supposed to reserve and then commit
 * in pieces.
 */
void *
dr_raw_mremap(void *old_address, size_t old_size, size_t new_size,
int flags, void *new_address)
{
byte *res;
dr_mem_info_t info;
dcontext_t *dcontext = get_thread_private_dcontext();
/* i#173: we need prot + type from prior to mremap */
DEBUG_DECLARE(bool ok =)
query_memory_ex(old_address, &info);
/* XXX: this could be a large region w/ multiple protection regions
* inside. For now we assume our handling of it doesn't care.
*/
ASSERT(ok);
if (is_pretend_or_executable_writable(old_address))
info.prot |= DR_MEMPROT_WRITE;
/* we just unconditionally send the 5th param */
res = (byte *) dynamorio_syscall(SYS_mremap, 5, old_address, old_size, new_size,
flags, new_address);
handle_app_mremap(dcontext, res, new_size, old_address, old_size,
info.prot, info.size);
return res;
}
DR_API
void *
dr_raw_brk(void *new_address)
{
/* We pay the cost of 2 syscalls. This should be infrequent enough that
 * it doesn't matter.
 */
if (new_address == NULL) {
/* Just a query */
return (void *) dynamorio_syscall(SYS_brk, 1, new_address);
} else {
byte *old_brk = (byte *) dynamorio_syscall(SYS_brk, 1, 0);
byte *res = (byte *) dynamorio_syscall(SYS_brk, 1, new_address);
dcontext_t *dcontext = get_thread_private_dcontext();
handle_app_brk(dcontext, old_brk, res);
return res;
}
}
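/* Usage sketch (illustrative): a client growing the app's brk by one page;
 * passing NULL performs a pure query:
 *
 *   byte *cur = (byte *) dr_raw_brk(NULL);             // query only
 *   byte *end = (byte *) dr_raw_brk(cur + PAGE_SIZE);  // attempt to extend
 *   bool grew = (end >= cur + PAGE_SIZE);
 */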
#endif /* CLIENT_INTERFACE && LINUX */
/* caller is required to handle thread synchronization and to update dynamo vm areas */
void
os_heap_free(void *p, size_t size, heap_error_code_t *error_code)
{
long rc;
ASSERT(error_code != NULL);
if (!dynamo_exited)
LOG(GLOBAL, LOG_HEAP, 4, "os_heap_free: %d bytes @ "PFX"\n", size, p);
rc = munmap_syscall(p, size);
if (rc != 0) {
*error_code = -rc;
} else {
*error_code = HEAP_ERROR_SUCCESS;
}
ASSERT(rc == 0);
}
/* reserve virtual address space without committing swap space for it,
and of course no physical pages since it will never be touched */
/* to be transparent, we do not use sbrk, and are
* instead using mmap, and asserting that all os_heap requests are for
* reasonably large pieces of memory */
void *
os_heap_reserve(void *preferred, size_t size, heap_error_code_t *error_code,
bool executable)
{
void *p;
uint prot = PROT_NONE;
#ifdef VMX86_SERVER
/* PR 365331: we need to be in the mmap_text region for code cache and
* gencode (PROT_EXEC).
*/
ASSERT(!os_in_vmkernel_userworld() ||
!executable || preferred == NULL ||
((byte *)preferred >= os_vmk_mmap_text_start() &&
((byte *)preferred)+size <= os_vmk_mmap_text_end()));
/* Note that a preferred address overrides PROT_EXEC and a mmap_data
* address will be honored, even though any execution there will fault.
*/
/* FIXME: note that PROT_EXEC => read access, so our guard pages and other
* non-committed memory, while not writable, is readable.
* Plus, we can't later clear all prot bits for userworld mmap due to PR 107872
* (PR 365748 covers fixing this for us).
* But in most uses we should get our preferred vmheap and shouldn't run
* out of vmheap, so this should be a corner-case issue.
*/
if (executable)
prot = PROT_EXEC;
#endif
/* should only be used on aligned pieces */
ASSERT(size > 0 && ALIGNED(size, PAGE_SIZE));
ASSERT(error_code != NULL);
/* FIXME: note that this memory is in fact still committed - see man mmap */
/* FIXME: case 2347 on Linux, or -vm_reserve should be set to false */
/* FIXME: we need to actually perform the mmap with MAP_NORESERVE */
p = mmap_syscall(preferred, size, prot, MAP_PRIVATE|MAP_ANONYMOUS
IF_X64(| (DYNAMO_OPTION(heap_in_lower_4GB) ?
MAP_32BIT : 0)),
-1, 0);
if (!mmap_syscall_succeeded(p)) {
*error_code = -(heap_error_code_t)(ptr_int_t)p;
LOG(GLOBAL, LOG_HEAP, 4,
"os_heap_reserve %d bytes failed "PFX"\n", size, p);
return NULL;
} else if (preferred != NULL && p != preferred) {
/* We didn't get the preferred address. To harmonize with Windows behavior
 * and give greater control we fail the reservation. */
heap_error_code_t dummy;
*error_code = HEAP_ERROR_NOT_AT_PREFERRED;
os_heap_free(p, size, &dummy);
ASSERT(dummy == HEAP_ERROR_SUCCESS);
LOG(GLOBAL, LOG_HEAP, 4,
"os_heap_reserve %d bytes at "PFX" not preferred "PFX"\n",
size, preferred, p);
return NULL;
} else {
*error_code = HEAP_ERROR_SUCCESS;
}
LOG(GLOBAL, LOG_HEAP, 2, "os_heap_reserve: %d bytes @ "PFX"\n", size, p);
#ifdef VMX86_SERVER
/* PR 365331: ensure our memory is all in the mmap_text region */
ASSERT(!os_in_vmkernel_userworld() || !executable ||
((byte *)p >= os_vmk_mmap_text_start() &&
((byte *)p) + size <= os_vmk_mmap_text_end()));
#endif
return p;
}
static bool
find_free_memory_in_region(byte *start, byte *end, size_t size,
byte **found_start OUT, byte **found_end OUT)
{
memquery_iter_t iter;
/* XXX: despite /proc/sys/vm/mmap_min_addr == PAGE_SIZE, mmap won't
* give me that address if I use it as a hint.
*/
app_pc last_end = (app_pc) (PAGE_SIZE*16);
bool found = false;
memquery_iterator_start(&iter, NULL, false/*won't alloc*/);
while (memquery_iterator_next(&iter)) {
if (iter.vm_start >= start &&
MIN(iter.vm_start, end) - MAX(last_end, start) >= size) {
if (found_start != NULL)
*found_start = MAX(last_end, start);
if (found_end != NULL)
*found_end = MIN(iter.vm_start, end);
found = true;
break;
}
if (iter.vm_start >= end)
break;
last_end = iter.vm_end;
}
memquery_iterator_stop(&iter);
return found;
}
void *
os_heap_reserve_in_region(void *start, void *end, size_t size,
heap_error_code_t *error_code, bool executable)
{
byte *p = NULL;
byte *try_start = NULL;
ASSERT(ALIGNED(start, PAGE_SIZE) && ALIGNED(end, PAGE_SIZE));
ASSERT(ALIGNED(size, PAGE_SIZE));
LOG(GLOBAL, LOG_HEAP, 3,
"os_heap_reserve_in_region: "SZFMT" bytes in "PFX"-"PFX"\n", size, start, end);
/* if no restriction on location use regular os_heap_reserve() */
if (start == (void *)PTR_UINT_0 && end == (void *)POINTER_MAX)
return os_heap_reserve(NULL, size, error_code, executable);
/* loop to handle races */
while (find_free_memory_in_region(start, end, size, &try_start, NULL)) {
p = os_heap_reserve(try_start, size, error_code, executable);
if (p != NULL) {
ASSERT(*error_code == HEAP_ERROR_SUCCESS);
ASSERT(p >= (byte *)start && p + size <= (byte *)end);
break;
}
}
if (p == NULL)
*error_code = HEAP_ERROR_CANT_RESERVE_IN_REGION;
else
*error_code = HEAP_ERROR_SUCCESS;
LOG(GLOBAL, LOG_HEAP, 2,
"os_heap_reserve_in_region: reserved "SZFMT" bytes @ "PFX" in "PFX"-"PFX"\n",
size, p, start, end);
return p;
}
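/* Usage sketch (illustrative): reserving one page inside a hypothetical
 * address range; both region bounds must be page-aligned:
 *
 *   heap_error_code_t err;
 *   void *p = os_heap_reserve_in_region((void *)(ptr_uint_t)0x10000000,
 *                                       (void *)(ptr_uint_t)0x20000000,
 *                                       PAGE_SIZE, &err, false/*!exec*/);
 *   if (p == NULL)
 *       ASSERT(err == HEAP_ERROR_CANT_RESERVE_IN_REGION);
 */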
/* commit previously reserved with os_heap_reserve pages */
/* returns false when out of memory */
/* A replacement of os_heap_alloc can be constructed by using os_heap_reserve
and os_heap_commit on a subset of the reserved pages. */
/* caller is required to handle thread synchronization */
bool
os_heap_commit(void *p, size_t size, uint prot, heap_error_code_t *error_code)
{
uint os_prot = memprot_to_osprot(prot);
long res;
/* should only be used on aligned pieces */
ASSERT(size > 0 && ALIGNED(size, PAGE_SIZE));
ASSERT(p);
ASSERT(error_code != NULL);
/* FIXME: note that the memory is only truly committed because os_heap_reserve
 * performs its mmap without MAP_NORESERVE; with MAP_NORESERVE this mprotect
 * alone would not commit backing store. */
res = mprotect_syscall(p, size, os_prot);
if (res != 0) {
*error_code = -res;
return false;
} else {
*error_code = HEAP_ERROR_SUCCESS;
}
LOG(GLOBAL, LOG_HEAP, 2, "os_heap_commit: %d bytes @ "PFX"\n", size, p);
return true;
}
/* caller is required to handle thread synchronization and to update dynamo vm areas */
void
os_heap_decommit(void *p, size_t size, heap_error_code_t *error_code)
{
int rc;
ASSERT(error_code != NULL);
if (!dynamo_exited)
LOG(GLOBAL, LOG_HEAP, 4, "os_heap_decommit: %d bytes @ "PFX"\n", size, p);
*error_code = HEAP_ERROR_SUCCESS;
/* FIXME: for now do nothing since os_heap_reserve has in fact committed the memory */
rc = 0;
/* TODO:
p = mmap_syscall(p, size, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
we should either do a mremap()
or we can do a munmap() followed 'quickly' by a mmap() -
also see above the comment that os_heap_reserve() in fact is not so lightweight
*/
ASSERT(rc == 0);
}
bool
os_heap_systemwide_overcommit(heap_error_code_t last_error_code)
{
/* FIXME: conservative answer yes */
return true;
}
bool
os_heap_get_commit_limit(size_t *commit_used, size_t *commit_limit)
{
/* FIXME - NYI */
return false;
}
/* yield the current thread */
void
os_thread_yield()
{
#ifdef MACOS
/* XXX i#1291: use raw syscall instead */
swtch_pri(0);
#else
dynamorio_syscall(SYS_sched_yield, 0);
#endif
}
static bool
thread_signal(process_id_t pid, thread_id_t tid, int signum)
{
#ifdef MACOS
/* FIXME i#58: this takes in a thread port. Need to map thread id to port.
* Need to figure out whether we support raw Mach threads w/o pthread on top.
*/
ASSERT_NOT_IMPLEMENTED(false);
return false;
#else
/* FIXME: for non-NPTL use SYS_kill */
/* Note that the pid is equivalent to the thread group id.
* However, we can have threads sharing address space but not pid
* (if created via CLONE_VM but not CLONE_THREAD), so make sure to
* use the pid of the target thread, not our pid.
*/
return (dynamorio_syscall(SYS_tgkill, 3, pid, tid, signum) == 0);
#endif
}
static bool
known_thread_signal(thread_record_t *tr, int signum)
{
#ifdef MACOS
ptr_int_t res;
if (tr->dcontext == NULL)
return false;
res = dynamorio_syscall(SYS___pthread_kill, 2, tr->dcontext->thread_port, signum);
LOG(THREAD_GET, LOG_ALL, 3, "%s: signal %d to port %d => %ld\n", __FUNCTION__,
signum, tr->dcontext->thread_port, res);
return res == 0;
#else
return thread_signal(tr->pid, tr->id, signum);
#endif
}
void
os_thread_sleep(uint64 milliseconds)
{
#ifdef MACOS
semaphore_t sem = MACH_PORT_NULL;
int res;
#else
struct timespec remain;
int count = 0;
#endif
struct timespec req;
req.tv_sec = (milliseconds / 1000);
/* docs say can go up to 1000000000, but doesn't work on FC9 */
req.tv_nsec = (milliseconds % 1000) * 1000000;
#ifdef MACOS
if (sem == MACH_PORT_NULL) {
DEBUG_DECLARE(kern_return_t res =)
semaphore_create(mach_task_self(), &sem, SYNC_POLICY_FIFO, 0);
ASSERT(res == KERN_SUCCESS);
}
res = dynamorio_syscall(SYSNUM_NO_CANCEL(SYS___semwait_signal),
6, sem, MACH_PORT_NULL, 1, 1,
(int64_t)req.tv_sec, (int32_t)req.tv_nsec);
if (res == -EINTR) {
/* FIXME i#58: figure out how much time elapsed and re-wait */
}
#else
/* FIXME: if we need accurate sleeps in presence of itimers we should
* be using SYS_clock_nanosleep w/ an absolute time instead of relative
*/
while (dynamorio_syscall(SYS_nanosleep, 2, &req, &remain) == -EINTR) {
/* interrupted by signal or something: finish the interval */
ASSERT_CURIOSITY_ONCE(remain.tv_sec <= req.tv_sec &&
                      (remain.tv_sec < req.tv_sec ||
                       /* there seems to be some rounding, and sometimes
                        * remain nsec > req nsec (I've seen 40K diff),
                        * so tolerate up to 100K of overshoot
                        */
                       req.tv_nsec - remain.tv_nsec > -100000));
/* not unusual for client threads to use itimers and have their run
* routine sleep forever
*/
if (count++ > 3 && !IS_CLIENT_THREAD(get_thread_private_dcontext())) {
ASSERT_NOT_REACHED();
break; /* paranoid */
}
req = remain;
}
#endif
}
bool
os_thread_suspend(thread_record_t *tr)
{
os_thread_data_t *ostd = (os_thread_data_t *) tr->dcontext->os_field;
ASSERT(ostd != NULL);
/* See synch comments in os_thread_resume: the mutex held there
* prevents prematurely sending a re-suspend signal.
*/
mutex_lock(&ostd->suspend_lock);
ostd->suspend_count++;
ASSERT(ostd->suspend_count > 0);
/* If already suspended, do not send another signal. However, we do
* need to ensure the target is suspended in case of a race, so we can't
* just return.
*/
if (ostd->suspend_count == 1) {
/* PR 212090: we use a custom signal handler to suspend. We wait
* here until the target reaches the suspend point, and leave it
* up to the caller to check whether it is a safe suspend point,
* to match Windows behavior.
*/
ASSERT(ksynch_get_value(&ostd->suspended) == 0);
if (!known_thread_signal(tr, SUSPEND_SIGNAL)) {
ostd->suspend_count--;
mutex_unlock(&ostd->suspend_lock);
return false;
}
}
/* we can unlock before the wait loop b/c we're using a separate "resumed"
* int and os_thread_resume holds the lock across its wait. this way a resume
* can proceed as soon as the suspended thread is suspended, before the
* suspending thread gets scheduled again.
*/
mutex_unlock(&ostd->suspend_lock);
while (ksynch_get_value(&ostd->suspended) == 0) {
/* On Linux, this waits only while the suspended flag is still 0. The
 * return value doesn't matter because the flag will be re-checked.
 */
ksynch_wait(&ostd->suspended, 0);
if (ksynch_get_value(&ostd->suspended) == 0) {
/* If it still has to wait, give up the cpu. */
os_thread_yield();
}
}
return true;
}
bool
os_thread_resume(thread_record_t *tr)
{
os_thread_data_t *ostd = (os_thread_data_t *) tr->dcontext->os_field;
ASSERT(ostd != NULL);
/* This mutex prevents sending a re-suspend signal before the target
* reaches a safe post-resume point from a first suspend signal.
* Given that race, we can't just use atomic_add_exchange_int +
* atomic_dec_becomes_zero on suspend_count.
*/
mutex_lock(&ostd->suspend_lock);
ASSERT(ostd->suspend_count > 0);
/* PR 479750: if we do get here and the target is not suspended then bail
 * out to avoid possible deadlocks
 */
if (ostd->suspend_count == 0) {
mutex_unlock(&ostd->suspend_lock);
return true; /* the thread is "resumed", so success status */
}
ostd->suspend_count--;
if (ostd->suspend_count > 0) {
mutex_unlock(&ostd->suspend_lock);
return true; /* still suspended */
}
ksynch_set_value(&ostd->wakeup, 1);
ksynch_wake(&ostd->wakeup);
while (ksynch_get_value(&ostd->resumed) == 0) {
/* On Linux, this waits only while the resumed flag is still 0. The
 * return value doesn't matter because the flag will be re-checked.
 */
ksynch_wait(&ostd->resumed, 0);
if (ksynch_get_value(&ostd->resumed) == 0) {
/* If it still has to wait, give up the cpu. */
os_thread_yield();
}
}
ksynch_set_value(&ostd->wakeup, 0);
ksynch_set_value(&ostd->resumed, 0);
mutex_unlock(&ostd->suspend_lock);
return true;
}
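/* Protocol sketch (illustrative): the ksynch variables pair up between the
 * suspender above and the target's SUSPEND_SIGNAL handler (which lives in
 * signal.c) roughly as follows:
 *
 *   suspender                         target's signal handler
 *   ---------                         -----------------------
 *   os_thread_suspend:
 *     send signal;                    set ->suspended = 1; ksynch_wake;
 *     wait on ->suspended             wait on ->wakeup
 *   os_thread_resume:
 *     set ->wakeup = 1; ksynch_wake;  set ->resumed = 1; ksynch_wake;
 *     wait on ->resumed;              return from handler
 *     reset ->wakeup and ->resumed
 */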
bool
os_thread_terminate(thread_record_t *tr)
{
/* PR 297902: for NPTL, sending SIGKILL would take down the whole group:
 * so instead we send SIGUSR2 and set a flag telling the
 * target thread to execute SYS_exit
 */
os_thread_data_t *ostd = (os_thread_data_t *) tr->dcontext->os_field;
ASSERT(ostd != NULL);
ostd->terminate = true;
return known_thread_signal(tr, SUSPEND_SIGNAL);
}
bool
is_thread_terminated(dcontext_t *dcontext)
{
os_thread_data_t *ostd = (os_thread_data_t *) dcontext->os_field;
ASSERT(ostd != NULL);
return (ksynch_get_value(&ostd->terminated) == 1);
}
void
os_wait_thread_terminated(dcontext_t *dcontext)
{
os_thread_data_t *ostd = (os_thread_data_t *) dcontext->os_field;
ASSERT(ostd != NULL);
while (ksynch_get_value(&ostd->terminated) == 0) {
/* On Linux, this waits only while the terminated flag is still 0. The
 * return value doesn't matter because the flag will be re-checked.
 */
ksynch_wait(&ostd->terminated, 0);
if (ksynch_get_value(&ostd->terminated) == 0) {
/* If it still has to wait, give up the cpu. */
os_thread_yield();
}
}
}
bool
thread_get_mcontext(thread_record_t *tr, priv_mcontext_t *mc)
{
/* PR 212090: only works when target is suspended by us, and
* we then take the signal context
*/
os_thread_data_t *ostd = (os_thread_data_t *) tr->dcontext->os_field;
ASSERT(ostd != NULL);
ASSERT(ostd->suspend_count > 0);
if (ostd->suspend_count == 0)
return false;
ASSERT(ostd->suspended_sigcxt != NULL);
sigcontext_to_mcontext(mc, ostd->suspended_sigcxt);
return true;
}
bool
thread_set_mcontext(thread_record_t *tr, priv_mcontext_t *mc)
{
/* PR 212090: only works when target is suspended by us, and
* we then replace the signal context
*/
os_thread_data_t *ostd = (os_thread_data_t *) tr->dcontext->os_field;
ASSERT(ostd != NULL);
ASSERT(ostd->suspend_count > 0);
if (ostd->suspend_count == 0)
return false;
ASSERT(ostd->suspended_sigcxt != NULL);
mcontext_to_sigcontext(ostd->suspended_sigcxt, mc);
return true;
}
bool
is_thread_currently_native(thread_record_t *tr)
{
return (!tr->under_dynamo_control ||
/* start/stop doesn't change under_dynamo_control and has its own field */
(tr->dcontext != NULL && tr->dcontext->currently_stopped));
}
#ifdef CLIENT_SIDELINE /* PR 222812: tied to sideline usage */
# ifdef LINUX /* XXX i#58: just until we have Mac support */
static void
client_thread_run(void)
{
void (*func)(void *param);
dcontext_t *dcontext;
byte *xsp;
GET_STACK_PTR(xsp);
void *crec = get_clone_record((reg_t)xsp);
IF_DEBUG(int rc = )
dynamo_thread_init(get_clone_record_dstack(crec), NULL, true);
ASSERT(rc != -1); /* this better be a new thread */
dcontext = get_thread_private_dcontext();
ASSERT(dcontext != NULL);
LOG(THREAD, LOG_ALL, 1, "\n***** CLIENT THREAD %d *****\n\n",
get_thread_id());
/* We stored the func and args in particular clone record fields */
func = (void (*)(void *param)) signal_thread_inherit(dcontext, crec);
void *arg = (void *) get_clone_record_app_xsp(crec);
LOG(THREAD, LOG_ALL, 1, "func="PFX", arg="PFX"\n", func, arg);
(*func)(arg);
LOG(THREAD, LOG_ALL, 1, "\n***** CLIENT THREAD %d EXITING *****\n\n",
get_thread_id());
cleanup_and_terminate(dcontext, SYS_exit, 0, 0, false/*just thread*/,
IF_MACOS_ELSE(dcontext->thread_port, 0), 0);
}
# endif
/* i#41/PR 222812: client threads
* * thread must have dcontext since many API routines require one and we
* don't expose GLOBAL_DCONTEXT (xref PR 243008, PR 216936, PR 536058)
* * reversed the old design of not using dstack (partly b/c want dcontext)
* and I'm using the same parent-creates-dstack and clone_record_t design
* to create linux threads: dstack should be big enough for client threads
* (xref PR 202669)
* * reversed the old design of explicit dr_terminate_client_thread(): now
* the thread is auto-terminated and stack cleaned up on return from run
* function
*/
DR_API bool
dr_create_client_thread(void (*func)(void *param), void *arg)
{
#ifdef LINUX
dcontext_t *dcontext = get_thread_private_dcontext();
byte *xsp;
/* We do not pass SIGCHLD since don't want signal to parent and don't support
* waiting on child.
* We do not pass CLONE_THREAD so that the new thread is in its own thread
* group, allowing it to have private itimers and not receive any signals
* sent to the app's thread groups. It also makes the thread not show up in
* the thread list for the app, making it more invisible.
*/
uint flags = CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND
IF_NOT_X64(| CLONE_SETTLS)
/* On vmkernel, CLONE_THREAD is required; signals and itimers are private anyway. */
IF_VMX86(| (os_in_vmkernel_userworld() ? CLONE_THREAD : 0));
pre_second_thread();
/* need to share signal handler table, prior to creating clone record */
handle_clone(dcontext, flags);
void *crec = create_clone_record(dcontext, (reg_t*)&xsp);
/* make sure client_thread_run can get the func and arg, and that
* signal_thread_inherit gets the right syscall info
*/
set_clone_record_fields(crec, (reg_t) arg, (app_pc) func, SYS_clone, flags);
/* i#501 switch to app's tls before creating client thread */
if (IF_CLIENT_INTERFACE_ELSE(INTERNAL_OPTION(private_loader), false))
os_switch_lib_tls(dcontext, true/*to app*/);
# if defined(X86) && !defined(X64)
/* For the TCB we simply share the parent's. On Linux we could just inherit
 * the same selector, but that does not work on VMX86_SERVER, so we specify
 * it explicitly for both on 32-bit. Most of the fields are pthreads-specific
 * and we assume the ones that will be used (such as tcbhead_t.sysinfo @0x10)
 * are read-only.
 */
our_modify_ldt_t desc;
/* if get_segment_base() returned size too we could use it */
uint index = tls_priv_lib_index();
ASSERT(index != -1);
if (!tls_get_descriptor(index, &desc)) {
LOG(THREAD, LOG_ALL, 1,
"%s: client thread tls get entry %d failed\n", __FUNCTION__, index);
return false;
}
# endif
LOG(THREAD, LOG_ALL, 1, "dr_create_client_thread xsp="PFX" dstack="PFX"\n",
xsp, get_clone_record_dstack(crec));
thread_id_t newpid = dynamorio_clone(flags, xsp, NULL,
IF_ARM_ELSE(NULL, IF_X64_ELSE(NULL, &desc)),
NULL, client_thread_run);
/* i#501: switch back to DR's tls after creating the client thread */
if (IF_CLIENT_INTERFACE_ELSE(INTERNAL_OPTION(private_loader), false))
os_switch_lib_tls(dcontext, false/*to dr*/);
if (newpid < 0) {
LOG(THREAD, LOG_ALL, 1, "client thread creation failed: %d\n", newpid);
return false;
} else if (newpid == 0) {
/* dynamorio_clone() should have called client_thread_run directly */
ASSERT_NOT_REACHED();
return false;
}
return true;
#else
ASSERT_NOT_IMPLEMENTED(false); /* FIXME i#58: implement on Mac */
return false;
#endif
}
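/* Usage sketch (illustrative): a client spawning a background thread; the
 * run function and its argument are hypothetical:
 *
 *   static void my_worker(void *arg)
 *   {
 *       // Runs on its own dstack with private itimers; returning here
 *       // auto-terminates the thread and frees its stack.
 *   }
 *   ...
 *   bool ok = dr_create_client_thread(my_worker, NULL);
 */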
#endif /* CLIENT_SIDELINE PR 222812: tied to sideline usage */
int
get_num_processors(void)
{
static uint num_cpu = 0; /* cached value */
if (!num_cpu) {
#ifdef MACOS
DEBUG_DECLARE(bool ok =)
sysctl_query(CTL_HW, HW_NCPU, &num_cpu, sizeof(num_cpu));
ASSERT(ok);
#else
/* We used to use get_nprocs_conf, but that's in libc, so now we just
* look at the /sys filesystem ourselves, which is what glibc does.
*/
uint local_num_cpus = 0;
file_t cpu_dir = os_open_directory("/sys/devices/system/cpu",
OS_OPEN_READ);
dir_iterator_t iter;
os_dir_iterator_start(&iter, cpu_dir);
while (os_dir_iterator_next(&iter)) {
int dummy_num;
if (sscanf(iter.name, "cpu%d", &dummy_num) == 1)
local_num_cpus++;
}
os_close(cpu_dir);
num_cpu = local_num_cpus;
#endif
ASSERT(num_cpu);
}
return num_cpu;
}
/* i#46: To support -no_private_loader, we have to call the dlfcn family of
* routines in libdl.so. When we do early injection, there is no loader to
* resolve these imports, so they will crash. Early injection is incompatible
* with -no_private_loader, so this should never happen.
*/
#if defined(CLIENT_INTERFACE) || defined(HOT_PATCHING_INTERFACE)
shlib_handle_t
load_shared_library(const char *name, bool reachable)
{
# ifdef STATIC_LIBRARY
if (os_files_same(name, get_application_name())) {
/* The private loader falls back to dlsym() and friends for modules it
* doesn't recognize, so this works without disabling the private loader.
*/
return dlopen(NULL, RTLD_LAZY); /* Gets a handle to the exe. */
}
# endif
/* We call locate_and_load_private_library() to support searching for
* a pathless name.
*/
if (IF_CLIENT_INTERFACE_ELSE(INTERNAL_OPTION(private_loader), false))
return (shlib_handle_t) locate_and_load_private_library(name, reachable);
ASSERT(!DYNAMO_OPTION(early_inject));
return dlopen(name, RTLD_LAZY);
}
#endif
#if defined(CLIENT_INTERFACE)
shlib_routine_ptr_t
lookup_library_routine(shlib_handle_t lib, const char *name)
{
if (IF_CLIENT_INTERFACE_ELSE(INTERNAL_OPTION(private_loader), false)) {
return (shlib_routine_ptr_t)
get_private_library_address((app_pc)lib, name);
}
ASSERT(!DYNAMO_OPTION(early_inject));
return dlsym(lib, name);
}
void
unload_shared_library(shlib_handle_t lib)
{
if (IF_CLIENT_INTERFACE_ELSE(INTERNAL_OPTION(private_loader), false)) {
unload_private_library(lib);
} else {
ASSERT(!DYNAMO_OPTION(early_inject));
if (!DYNAMO_OPTION(avoid_dlclose)) {
dlclose(lib);
}
}
}
void
shared_library_error(char *buf, int maxlen)
{
const char *err;
if (IF_CLIENT_INTERFACE_ELSE(INTERNAL_OPTION(private_loader), false)) {
err = "error in private loader";
} else {
ASSERT(!DYNAMO_OPTION(early_inject));
err = dlerror();
if (err == NULL) {
err = "dlerror returned NULL";
}
}
strncpy(buf, err, maxlen-1);
buf[maxlen-1] = '\0'; /* strncpy won't put on trailing null if maxes out */
}
/* addr is any pointer known to lie within the library.
* for linux, one of addr or name is needed; for windows, neither is needed.
*/
bool
shared_library_bounds(IN shlib_handle_t lib, IN byte *addr,
IN const char *name,
OUT byte **start, OUT byte **end)
{
ASSERT(start != NULL && end != NULL);
/* PR 366195: dlopen() handle truly is opaque, so we have to use either
* addr or name
*/
ASSERT(addr != NULL || name != NULL);
*start = addr;
if (IF_CLIENT_INTERFACE_ELSE(INTERNAL_OPTION(private_loader), false)) {
privmod_t *mod;
/* look for private library first */
acquire_recursive_lock(&privload_lock);
mod = privload_lookup_by_base((app_pc)lib);
if (name != NULL && mod == NULL)
mod = privload_lookup(name);
if (mod != NULL && !mod->externally_loaded) {
*start = mod->base;
if (end != NULL)
*end = mod->base + mod->size;
release_recursive_lock(&privload_lock);
return true;
}
release_recursive_lock(&privload_lock);
}
return (memquery_library_bounds(name, start, end, NULL, 0) > 0);
}
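/* Usage sketch (illustrative): locating the bounds of a loaded library by
 * name; "libfoo.so" is a hypothetical module name:
 *
 *   byte *start, *end;
 *   if (shared_library_bounds(lib, NULL, "libfoo.so", &start, &end)) {
 *       // [start, end) now spans the library image
 *   }
 */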
#endif /* defined(CLIENT_INTERFACE) */
#endif /* !NOT_DYNAMORIO_CORE_PROPER: around most of file, to exclude preload */
/* FIXME - not available in 2.0 or earlier kernels, not really an issue since no one
* should be running anything that old. */
int
llseek_syscall(int fd, int64 offset, int origin, int64 *result)
{
#if defined(X64) || defined(MACOS)
# ifndef X64
/* 2 slots for 64-bit arg */
*result = dynamorio_syscall(SYS_lseek, 4, fd, (uint)(offset & 0xFFFFFFFF),
(uint)((offset >> 32) & 0xFFFFFFFF), origin);
# else
*result = dynamorio_syscall(SYS_lseek, 3, fd, offset, origin);
# endif
return ((*result > 0) ? 0 : (int)*result);
#else
return dynamorio_syscall(SYS__llseek, 5, fd, (uint)((offset >> 32) & 0xFFFFFFFF),
(uint)(offset & 0xFFFFFFFF), result, origin);
#endif
}
bool
os_file_exists(const char *fname, bool is_dir)
{
/* _LARGEFILE64_SOURCE should make libc struct match kernel (see top of file) */
struct stat64 st;
ptr_int_t res = dynamorio_syscall(SYSNUM_STAT, 2, fname, &st);
if (res != 0) {
LOG(THREAD_GET, LOG_SYSCALLS, 2, "%s failed: "PIFX"\n", __func__, res);
return false;
}
return (!is_dir || S_ISDIR(st.st_mode));
}
/* Returns true if two paths point to the same file. Follows symlinks.
*/
bool
os_files_same(const char *path1, const char *path2)
{
struct stat64 st1, st2;
ptr_int_t res = dynamorio_syscall(SYSNUM_STAT, 2, path1, &st1);
if (res != 0) {
LOG(THREAD_GET, LOG_SYSCALLS, 2, "%s failed: "PIFX"\n", __func__, res);
return false;
}
res = dynamorio_syscall(SYSNUM_STAT, 2, path2, &st2);
if (res != 0) {
LOG(THREAD_GET, LOG_SYSCALLS, 2, "%s failed: "PIFX"\n", __func__, res);
return false;
}
/* st_ino alone is only unique per device */
return (st1.st_ino == st2.st_ino && st1.st_dev == st2.st_dev);
}
bool
os_get_file_size(const char *file, uint64 *size)
{
/* _LARGEFILE64_SOURCE should make libc struct match kernel (see top of file) */
struct stat64 st;
ptr_int_t res = dynamorio_syscall(SYSNUM_STAT, 2, file, &st);
if (res != 0) {
LOG(THREAD_GET, LOG_SYSCALLS, 2, "%s failed: "PIFX"\n", __func__, res);
return false;
}
ASSERT(size != NULL);
*size = st.st_size;
return true;
}
bool
os_get_file_size_by_handle(file_t fd, uint64 *size)
{
/* _LARGEFILE64_SOURCE should make libc struct match kernel (see top of file) */
struct stat64 st;
ptr_int_t res = dynamorio_syscall(SYSNUM_FSTAT, 2, fd, &st);
if (res != 0) {
LOG(THREAD_GET, LOG_SYSCALLS, 2, "%s failed: "PIFX"\n", __func__, res);
return false;
}
ASSERT(size != NULL);
*size = st.st_size;
return true;
}
/* created directory will be owned by effective uid,
* Note a symbolic link will never be followed.
*/
bool
os_create_dir(const char *fname, create_directory_flags_t create_dir_flags)
{
bool require_new = TEST(CREATE_DIR_REQUIRE_NEW, create_dir_flags);
int rc = dynamorio_syscall(SYS_mkdir, 2, fname, S_IRWXU|S_IRWXG);
ASSERT(create_dir_flags == CREATE_DIR_REQUIRE_NEW ||
create_dir_flags == CREATE_DIR_ALLOW_EXISTING);
return (rc == 0 || (!require_new && rc == -EEXIST));
}
bool
os_delete_dir(const char *name)
{
return (dynamorio_syscall(SYS_rmdir, 1, name) == 0);
}
int
open_syscall(const char *file, int flags, int mode)
{
ASSERT(file != NULL);
return dynamorio_syscall(SYSNUM_NO_CANCEL(SYS_open), 3, file, flags, mode);
}
int
close_syscall(int fd)
{
return dynamorio_syscall(SYSNUM_NO_CANCEL(SYS_close), 1, fd);
}
int
dup_syscall(int fd)
{
return dynamorio_syscall(SYS_dup, 1, fd);
}
ssize_t
read_syscall(int fd, void *buf, size_t nbytes)
{
return dynamorio_syscall(SYSNUM_NO_CANCEL(SYS_read), 3, fd, buf, nbytes);
}
ssize_t
write_syscall(int fd, const void *buf, size_t nbytes)
{
return dynamorio_syscall(SYSNUM_NO_CANCEL(SYS_write), 3, fd, buf, nbytes);
}
#ifndef NOT_DYNAMORIO_CORE_PROPER
static int
fcntl_syscall(int fd, int cmd, long arg)
{
return dynamorio_syscall(SYSNUM_NO_CANCEL(SYS_fcntl), 3, fd, cmd, arg);
}
#endif /* !NOT_DYNAMORIO_CORE_PROPER */
/* not easily accessible in header files */
#ifndef O_LARGEFILE
# ifdef X64
/* not needed */
# define O_LARGEFILE 0
# else
# define O_LARGEFILE 0100000
# endif
#endif
/* we assume that opening for writing wants to create file.
* we also assume that nobody calling this is creating a persistent
* file: for that, use os_open_protected() to avoid leaking on exec
* and to separate from the app's files.
*/
file_t
os_open(const char *fname, int os_open_flags)
{
int res;
int flags = 0;
if (TEST(OS_OPEN_ALLOW_LARGE, os_open_flags))
flags |= O_LARGEFILE;
if (!TEST(OS_OPEN_WRITE, os_open_flags))
res = open_syscall(fname, flags|O_RDONLY, 0);
else {
res = open_syscall(fname, flags|O_RDWR|O_CREAT|
(TEST(OS_OPEN_APPEND, os_open_flags) ?
/* Currently we only support either appending
* or truncating, just like Windows and the client
* interface. If we end up w/ a use case that wants
* neither it could open append and then seek; if we do
* add OS_TRUNCATE or sthg we'll need to add it to
* any current writers who don't set OS_OPEN_REQUIRE_NEW.
*/
O_APPEND : O_TRUNC) |
(TEST(OS_OPEN_REQUIRE_NEW, os_open_flags) ?
O_EXCL : 0),
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP);
}
if (res < 0)
return INVALID_FILE;
return res;
}
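/* Usage sketch (illustrative): "/tmp/mylog" is a hypothetical path; the
 * first open creates a brand-new file, the second appends to it:
 *
 *   file_t f = os_open("/tmp/mylog", OS_OPEN_WRITE|OS_OPEN_REQUIRE_NEW);
 *   file_t g = os_open("/tmp/mylog", OS_OPEN_WRITE|OS_OPEN_APPEND);
 *   if (f != INVALID_FILE)
 *       os_close(f);
 *   if (g != INVALID_FILE)
 *       os_close(g);
 */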
file_t
os_open_directory(const char *fname, int os_open_flags)
{
/* no special handling */
return os_open(fname, os_open_flags);
}
void
os_close(file_t f)
{
close_syscall(f);
}
#ifndef NOT_DYNAMORIO_CORE_PROPER
/* dups curfd to a private fd.
* returns -1 if unsuccessful.
*/
file_t
fd_priv_dup(file_t curfd)
{
file_t newfd = -1;
if (DYNAMO_OPTION(steal_fds) > 0) {
/* RLIMIT_NOFILES is 1 greater than max and F_DUPFD starts at given value */
/* XXX: if > linux 2.6.24, can use F_DUPFD_CLOEXEC to avoid later call:
* so how do we tell if the flag is supported? try calling once at init?
*/
newfd = fcntl_syscall(curfd, F_DUPFD, app_rlimit_nofile.rlim_cur);
if (newfd < 0) {
/* We probably ran out of fds, esp if debug build and there are
* lots of threads. Should we track how many we've given out to
* avoid a failed syscall every time after?
*/
SYSLOG_INTERNAL_WARNING_ONCE("ran out of stolen fd space");
/* Try again but this time in the app space, somewhere high up
* to avoid issues like tcsh assuming it can own fds 3-5 for
* piping std{in,out,err} (xref the old -open_tcsh_fds option).
*/
newfd = fcntl_syscall(curfd, F_DUPFD, app_rlimit_nofile.rlim_cur/2);
}
}
return newfd;
}
bool
fd_mark_close_on_exec(file_t fd)
{
/* we assume FD_CLOEXEC is the only flag and don't bother w/ F_GETFD */
if (fcntl_syscall(fd, F_SETFD, FD_CLOEXEC) != 0) {
SYSLOG_INTERNAL_WARNING("unable to mark file %d as close-on-exec", fd);
return false;
}
return true;
}
void
fd_table_add(file_t fd, uint flags)
{
if (fd_table != NULL) {
TABLE_RWLOCK(fd_table, write, lock);
DODEBUG({
/* i#1010: If the fd is already in the table, chances are it's a
* stale logfile fd left behind by a vforked or cloned child that
* called execve. Avoid an assert if that happens.
*/
bool present = generic_hash_remove(GLOBAL_DCONTEXT, fd_table,
(ptr_uint_t)fd);
ASSERT_CURIOSITY_ONCE(!present && "stale fd not cleaned up");
});
generic_hash_add(GLOBAL_DCONTEXT, fd_table, (ptr_uint_t)fd,
/* store the flags, w/ a set bit to ensure not 0 */
(void *)(ptr_uint_t)(flags|OS_OPEN_RESERVED));
TABLE_RWLOCK(fd_table, write, unlock);
} else {
#ifdef DEBUG
static int num_pre_heap;
num_pre_heap++;
/* we add main_logfile in os_init() */
ASSERT(num_pre_heap == 1 && "only main_logfile should come here");
#endif
}
}
static bool
fd_is_dr_owned(file_t fd)
{
ptr_uint_t flags;
ASSERT(fd_table != NULL);
TABLE_RWLOCK(fd_table, read, lock);
flags = (ptr_uint_t) generic_hash_lookup(GLOBAL_DCONTEXT, fd_table, (ptr_uint_t)fd);
TABLE_RWLOCK(fd_table, read, unlock);
return (flags != 0);
}
static bool
fd_is_in_private_range(file_t fd)
{
return (DYNAMO_OPTION(steal_fds) > 0 &&
app_rlimit_nofile.rlim_cur > 0 &&
fd >= app_rlimit_nofile.rlim_cur);
}
file_t
os_open_protected(const char *fname, int os_open_flags)
{
file_t dup;
file_t res = os_open(fname, os_open_flags);
if (res < 0)
return res;
/* we could have os_open() always switch to a private fd but it's probably
* not worth the extra syscall for temporary open/close sequences so we
* only use it for persistent files
*/
dup = fd_priv_dup(res);
if (dup >= 0) {
close_syscall(res);
res = dup;
fd_mark_close_on_exec(res);
} /* else just keep original */
/* ditto here, plus for things like config.c opening files we can't handle
* grabbing locks and often don't have heap available so no fd_table
*/
fd_table_add(res, os_open_flags);
return res;
}
void
os_close_protected(file_t f)
{
ASSERT(fd_table != NULL || dynamo_exited);
if (fd_table != NULL) {
TABLE_RWLOCK(fd_table, write, lock);
generic_hash_remove(GLOBAL_DCONTEXT, fd_table, (ptr_uint_t)f);
TABLE_RWLOCK(fd_table, write, unlock);
}
os_close(f);
}
bool
os_get_current_dir(char *buf, size_t bufsz)
{
# ifdef MACOS
static char noheap_buf[MAXPATHLEN];
bool res = false;
file_t fd = os_open(".", OS_OPEN_READ);
int len;
/* F_GETPATH assumes a buffer of size MAXPATHLEN */
char *fcntl_buf;
if (dynamo_heap_initialized)
fcntl_buf = global_heap_alloc(MAXPATHLEN HEAPACCT(ACCT_OTHER));
else
fcntl_buf = noheap_buf;
if (fd == INVALID_FILE)
goto cwd_error;
if (fcntl_syscall(fd, F_GETPATH, (long)fcntl_buf) != 0)
goto cwd_error;
len = snprintf(buf, bufsz, "%s", fcntl_buf);
buf[bufsz-1] = '\0';
res = (len > 0 && len < bufsz);
/* fall through so the success path also frees fcntl_buf and closes fd */
cwd_error:
if (dynamo_heap_initialized)
global_heap_free(fcntl_buf, MAXPATHLEN HEAPACCT(ACCT_OTHER));
if (fd != INVALID_FILE)
os_close(fd);
return res;
# else
return (dynamorio_syscall(SYS_getcwd, 2, buf, bufsz) > 0);
# endif
}
#endif /* !NOT_DYNAMORIO_CORE_PROPER */
#ifndef NOT_DYNAMORIO_CORE_PROPER /* so drinject can use drdecode's copy */
ssize_t
os_write(file_t f, const void *buf, size_t count)
{
return write_syscall(f, buf, count);
}
#endif /* !NOT_DYNAMORIO_CORE_PROPER */
ssize_t
os_read(file_t f, void *buf, size_t count)
{
return read_syscall(f, buf, count);
}
void
os_flush(file_t f)
{
/* we're not using FILE*, so there is no buffering */
}
/* seek the current file position to offset bytes from origin, return true if successful */
bool
os_seek(file_t f, int64 offset, int origin)
{
int64 result;
int ret = 0;
ret = llseek_syscall(f, offset, origin, &result);
return (ret == 0);
}
/* return the current file position, -1 on failure */
int64
os_tell(file_t f)
{
int64 result = -1;
int ret = 0;
ret = llseek_syscall(f, 0, SEEK_CUR, &result);
if (ret != 0)
return -1;
return result;
}
bool
os_delete_file(const char *name)
{
return (dynamorio_syscall(SYS_unlink, 1, name) == 0);
}
bool
os_rename_file(const char *orig_name, const char *new_name, bool replace)
{
ptr_int_t res;
if (!replace) {
/* SYS_rename replaces so we must test beforehand => could have race */
/* _LARGEFILE64_SOURCE should make libc struct match kernel (see top of file) */
struct stat64 st;
ptr_int_t res = dynamorio_syscall(SYSNUM_STAT, 2, new_name, &st);
if (res == 0)
return false;
else if (res != -ENOENT) {
LOG(THREAD_GET, LOG_SYSCALLS, 2, "%s stat failed: "PIFX"\n", __func__, res);
return false;
}
}
res = dynamorio_syscall(SYS_rename, 2, orig_name, new_name);
if (res != 0)
LOG(THREAD_GET, LOG_SYSCALLS, 2, "%s \"%s\" to \"%s\" failed: "PIFX"\n",
__func__, orig_name, new_name, res);
return (res == 0);
}
bool
os_delete_mapped_file(const char *filename)
{
return os_delete_file(filename);
}
byte *
os_map_file(file_t f, size_t *size INOUT, uint64 offs, app_pc addr, uint prot,
map_flags_t map_flags)
{
int flags;
byte *map;
#if defined(X64) && !defined(NOT_DYNAMORIO_CORE_PROPER)
bool loop = false;
uint iters = 0;
# define MAX_MMAP_LOOP_ITERS 100
byte *region_start = NULL, *region_end = NULL;
#else
uint pg_offs;
ASSERT_TRUNCATE(pg_offs, uint, offs / PAGE_SIZE);
pg_offs = (uint) (offs / PAGE_SIZE);
#endif
#ifdef VMX86_SERVER
flags = MAP_PRIVATE; /* MAP_SHARED not supported yet */
#else
flags = TEST(MAP_FILE_COPY_ON_WRITE, map_flags) ? MAP_PRIVATE : MAP_SHARED;
#endif
#if defined(X64) && !defined(NOT_DYNAMORIO_CORE_PROPER)
/* Allocate memory from reachable range for image: or anything (pcache
* in particular): for low 4GB, easiest to just pass MAP_32BIT (which is
* low 2GB, but good enough).
*/
if (DYNAMO_OPTION(heap_in_lower_4GB) && !TEST(MAP_FILE_FIXED, map_flags))
flags |= MAP_32BIT;
#endif
/* Allow an anonymous memory request instead of a file mapping, so we can
 * request memory at a particular address with the fixed argument */
if (f == -1)
flags |= MAP_ANONYMOUS;
if (TEST(MAP_FILE_FIXED, map_flags))
flags |= MAP_FIXED;
/* Reachability is not supported for drinjectlib */
#if defined(X64) && !defined(NOT_DYNAMORIO_CORE_PROPER)
if (!TEST(MAP_32BIT, flags) && TEST(MAP_FILE_REACHABLE, map_flags)) {
vmcode_get_reachable_region(&region_start, &region_end);
/* addr need not be NULL: we'll use it if it's in the region */
ASSERT(!TEST(MAP_FILE_FIXED, map_flags));
/* Loop to handle races */
loop = true;
}
while (!loop ||
(addr != NULL && addr >= region_start && addr+*size <= region_end) ||
find_free_memory_in_region(region_start, region_end, *size, &addr, NULL)) {
#endif
map = mmap_syscall(addr, *size, memprot_to_osprot(prot),
flags, f,
/* x86 Linux mmap uses offset in pages */
IF_LINUX_ELSE(IF_X64_ELSE(offs, pg_offs), offs));
if (!mmap_syscall_succeeded(map)) {
LOG(THREAD_GET, LOG_SYSCALLS, 2, "%s failed: "PIFX"\n",
__func__, map);
map = NULL;
}
#if defined(X64) && !defined(NOT_DYNAMORIO_CORE_PROPER)
else if (loop && (map < region_start || map+*size > region_end)) {
/* Try again: probably a race. Hopefully our notion of "there's a free
* region big enough" matches the kernel's, else we'll loop forever
* (which we try to catch w/ a max iters count).
*/
munmap_syscall(map, *size);
map = NULL;
} else
break;
if (!loop)
break;
if (++iters > MAX_MMAP_LOOP_ITERS) {
ASSERT_NOT_REACHED();
map = NULL;
break;
}
addr = NULL; /* pick a new one */
}
#endif
return map;
}
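/* Usage sketch (illustrative): mapping the first page of an already-open
 * file copy-on-write at any address:
 *
 *   size_t sz = PAGE_SIZE;
 *   byte *base = os_map_file(f, &sz, 0/*offs*/, NULL/*any addr*/,
 *                            MEMPROT_READ|MEMPROT_WRITE,
 *                            MAP_FILE_COPY_ON_WRITE);
 *   if (base != NULL)
 *       os_unmap_file(base, sz);
 */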
bool
os_unmap_file(byte *map, size_t size)
{
long res = munmap_syscall(map, size);
return (res == 0);
}
/* around most of file, to exclude preload */
#if !defined(NOT_DYNAMORIO_CORE_PROPER) || defined(STANDALONE_UNIT_TEST)
bool
os_get_disk_free_space(/*IN*/ file_t file_handle,
/*OUT*/ uint64 *AvailableQuotaBytes /*OPTIONAL*/,
/*OUT*/ uint64 *TotalQuotaBytes /*OPTIONAL*/,
/*OUT*/ uint64 *TotalVolumeBytes /*OPTIONAL*/)
{
/* libc struct seems to match kernel's */
struct statfs stat;
ptr_int_t res = dynamorio_syscall(SYS_fstatfs, 2, file_handle, &stat);
if (res != 0) {
LOG(THREAD_GET, LOG_SYSCALLS, 2, "%s failed: "PIFX"\n", __func__, res);
return false;
}
LOG(GLOBAL, LOG_STATS, 3,
"os_get_disk_free_space: avail="SZFMT", free="SZFMT", bsize="SZFMT"\n",
stat.f_bavail, stat.f_bfree, stat.f_bsize);
if (AvailableQuotaBytes != NULL)
*AvailableQuotaBytes = ((uint64)stat.f_bavail * stat.f_bsize);
/* no support for quotas */
if (TotalQuotaBytes != NULL)
*TotalQuotaBytes = ((uint64)stat.f_bavail * stat.f_bsize);
if (TotalVolumeBytes != NULL) /* despite name this is how much is free */
*TotalVolumeBytes = ((uint64)stat.f_bfree * stat.f_bsize);
return true;
}
void
exit_process_syscall(long status)
{
/* We now assume SYS_exit_group is defined: not building on old machines,
* but will execute there. We try exit_group and if it fails we use exit.
*
* FIXME: if no exit_group, kill all other threads (==processes in same addr
* space) manually? Presumably we got here b/c at an unsafe point to do
* full exit? Or is that not true: what about dr_abort()?
*/
dynamorio_syscall(SYSNUM_EXIT_PROCESS, 1, status);
/* would assert that result is -ENOSYS but assert likely calls us => infinite loop */
exit_thread_syscall(status);
ASSERT_NOT_REACHED();
}
void
exit_thread_syscall(long status)
{
#ifdef MACOS
mach_port_t thread_port = dynamorio_mach_syscall(MACH_thread_self_trap, 0);
/* FIXME i#1403: on MacOS we fail to free the app's stack: we need to pass it to
* bsdthread_terminate.
*/
dynamorio_syscall(SYSNUM_EXIT_THREAD, 4, 0, 0, thread_port, 0);
#else
dynamorio_syscall(SYSNUM_EXIT_THREAD, 1, status);
#endif
}
/* FIXME: this one will not be easily internationalizable,
 * yet it is easier to have a syslog-based Unix implementation with real strings.
 */
void
os_syslog(syslog_event_type_t priority, uint message_id,
uint substitutions_num, va_list args)
{
int native_priority;
switch (priority) {
case SYSLOG_INFORMATION: native_priority = LOG_INFO; break;
case SYSLOG_WARNING: native_priority = LOG_WARNING; break;
case SYSLOG_CRITICAL: native_priority = LOG_CRIT; break;
case SYSLOG_ERROR: native_priority = LOG_ERR; break;
default:
ASSERT_NOT_REACHED();
}
/* can amount to passing a format string (careful here) to vsyslog */
/* Never let user controlled data in the format string! */
ASSERT_NOT_IMPLEMENTED(false);
}
/* This is subject to races, but should only happen at init/attach when
* there should only be one live thread.
*/
static bool
safe_read_via_query(const void *base, size_t size, void *out_buf, size_t *bytes_read)
{
bool res = false;
size_t num_read = 0;
ASSERT(!fault_handling_initialized);
/* XXX: in today's init ordering, allmem will never be initialized when we come
* here, but we check it nevertheless to be general in case this routine is
* ever called at some later time
*/
if (IF_MEMQUERY_ELSE(false, memcache_initialized()))
res = is_readable_without_exception_internal(base, size, false/*use allmem*/);
else
res = is_readable_without_exception_query_os((void *)base, size);
if (res) {
memcpy(out_buf, base, size);
num_read = size;
}
if (bytes_read != NULL)
*bytes_read = num_read;
return res;
}
bool
safe_read_ex(const void *base, size_t size, void *out_buf, size_t *bytes_read)
{
STATS_INC(num_safe_reads);
/* XXX i#350: we'd like to always use safe_read_fast() and remove this extra
* call layer, but safe_read_fast() requires fault handling to be set up.
* We do set up an early signal handler in os_init(),
* but there is still a window prior to that with no handler.
*/
if (!fault_handling_initialized) {
return safe_read_via_query(base, size, out_buf, bytes_read);
} else {
return safe_read_fast(base, size, out_buf, bytes_read);
}
}
/* FIXME - fold this together with safe_read_ex() (is a lot of places to update) */
bool
safe_read(const void *base, size_t size, void *out_buf)
{
return safe_read_ex(base, size, out_buf, NULL);
}
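/* Usage sketch (illustrative): probing a possibly-unmapped app address
 * without risking a fault; untrusted_ptr is a hypothetical app pointer:
 *
 *   app_pc value;
 *   if (safe_read(untrusted_ptr, sizeof(value), &value)) {
 *       // value now holds a copy of the app data
 *   }
 */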
bool
safe_write_ex(void *base, size_t size, const void *in_buf, size_t *bytes_written)
{
uint prot;
byte *region_base;
size_t region_size;
dcontext_t *dcontext = get_thread_private_dcontext();
bool res = false;
if (bytes_written != NULL)
*bytes_written = 0;
if (dcontext != NULL) {
TRY_EXCEPT(dcontext, {
/* We abort on the 1st fault, just like safe_read */
memcpy(base, in_buf, size);
res = true;
} , { /* EXCEPT */
/* nothing: res is already false */
});
} else {
/* this is subject to races, but should only happen at init/attach when
* there should only be one live thread.
*/
/* on x86 must be readable to be writable so start with that */
if (is_readable_without_exception(base, size) &&
get_memory_info_from_os(base, &region_base, &region_size, &prot) &&
TEST(MEMPROT_WRITE, prot)) {
size_t bytes_checked = region_size - ((byte *)base - region_base);
while (bytes_checked < size) {
if (!get_memory_info_from_os(region_base + region_size, &region_base,
&region_size, &prot) ||
!TEST(MEMPROT_WRITE, prot))
return false;
bytes_checked += region_size;
}
} else {
return false;
}
/* ok, checks passed, do the copy. FIXME: because of races this isn't safe! */
memcpy(base, in_buf, size);
res = true;
}
if (res) {
if (bytes_written != NULL)
*bytes_written = size;
}
return res;
}
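/* Usage sketch (illustrative only), mirroring safe_read for stores:
*     size_t written;
*     if (!safe_write_ex(app_addr, sizeof(val), &val, &written))
*         ...handle the failure...
* Note that on failure *bytes_written is 0 rather than a partial count,
* since we abort at the first fault.
*/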
/* is_readable_without_exception checks to see that all bytes with addresses
* from pc to pc+size-1 are readable and that reading from there won't
* generate an exception. If 'query_os' is true, check what the OS thinks
* the prot bits are instead of using the all memory list.
*/
static bool
is_readable_without_exception_internal(const byte *pc, size_t size, bool query_os)
{
uint prot = MEMPROT_NONE;
byte *check_pc = (byte *) ALIGN_BACKWARD(pc, PAGE_SIZE);
if (size > ((byte *)POINTER_MAX - pc))
size = (byte *)POINTER_MAX - pc;
do {
bool rc = query_os ?
get_memory_info_from_os(check_pc, NULL, NULL, &prot) :
get_memory_info(check_pc, NULL, NULL, &prot);
if (!rc || !TESTANY(MEMPROT_READ|MEMPROT_EXEC, prot))
return false;
if (POINTER_OVERFLOW_ON_ADD(check_pc, PAGE_SIZE))
break;
check_pc += PAGE_SIZE;
} while (check_pc < pc+size);
return true;
}
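/* Worked example: with 4K pages, pc=0x1234 and size=0x2000 give an initial
* check_pc of 0x1000, and the loop above queries pages 0x1000, 0x2000, and
* 0x3000 (since 0x3000 < 0x1234+0x2000), covering every page the range touches.
*/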
bool
is_readable_without_exception(const byte *pc, size_t size)
{
/* case 9745 / i#853: We've had problems with all_memory_areas not being
* accurate in the past. Parsing proc maps is too slow for some apps, so we
* use a runtime option.
*/
bool query_os = IF_MEMQUERY_ELSE(true, !DYNAMO_OPTION(use_all_memory_areas));
return is_readable_without_exception_internal(pc, size, query_os);
}
/* Identical to is_readable_without_exception except that the os is queried
* for info on the indicated region */
bool
is_readable_without_exception_query_os(byte *pc, size_t size)
{
return is_readable_without_exception_internal(pc, size, true);
}
bool
is_user_address(byte *pc)
{
/* FIXME: NYI */
/* note returning true will always skip the case 9022 logic on Linux */
return true;
}
#endif /* !NOT_DYNAMORIO_CORE_PROPER */
/* change protections on memory region starting at pc of length length
* this does not update the all memory area info
*/
bool
os_set_protection(byte *pc, size_t length, uint prot/*MEMPROT_*/)
{
app_pc start_page = (app_pc) PAGE_START(pc);
uint num_bytes = ALIGN_FORWARD(length + (pc - start_page), PAGE_SIZE);
long res = 0;
uint flags = memprot_to_osprot(prot);
#ifdef IA32_ON_IA64
LOG(THREAD_GET, LOG_VMAREAS, 1, "protection change not supported on IA64\n");
LOG(THREAD_GET, LOG_VMAREAS, 1, " attempted change_prot("PFX", "PIFX", %s) => "
"mprotect("PFX", "PIFX")==%d pages\n",
pc, length, memprot_string(prot), start_page, num_bytes,
num_bytes / PAGE_SIZE);
#else
DOSTATS({
/* once on each side of prot, to get on right side of writability */
if (!TEST(PROT_WRITE, flags)) {
STATS_INC(protection_change_calls);
STATS_ADD(protection_change_pages, num_bytes / PAGE_SIZE);
}
});
res = mprotect_syscall((void *) start_page, num_bytes, flags);
if (res != 0)
return false;
LOG(THREAD_GET, LOG_VMAREAS, 3, "change_prot("PFX", "PIFX", %s) => "
"mprotect("PFX", "PIFX", %d)==%d pages\n",
pc, length, memprot_string(prot), start_page, num_bytes, flags,
num_bytes / PAGE_SIZE);
#endif
DOSTATS({
/* once on each side of prot, to get on right side of writability */
if (TEST(PROT_WRITE, flags)) {
STATS_INC(protection_change_calls);
STATS_ADD(protection_change_pages, num_bytes / PAGE_SIZE);
}
});
return true;
}
#ifndef NOT_DYNAMORIO_CORE_PROPER
/* change protections on memory region starting at pc of length length */
bool
set_protection(byte *pc, size_t length, uint prot/*MEMPROT_*/)
{
if (os_set_protection(pc, length, prot) == false)
return false;
#ifndef HAVE_MEMINFO_QUERY
else {
app_pc start_page = (app_pc) PAGE_START(pc);
uint num_bytes = ALIGN_FORWARD(length + (pc - start_page), PAGE_SIZE);
memcache_update_locked(start_page, start_page + num_bytes,
prot, -1/*type unchanged*/, true/*exists*/);
}
#endif
return true;
}
/* change protections on memory region starting at pc of length length */
bool
change_protection(byte *pc, size_t length, bool writable)
{
uint flags = (writable) ? (MEMPROT_READ|MEMPROT_WRITE) : (MEMPROT_READ);
return set_protection(pc, length, flags);
}
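/* Usage sketch (illustrative only): note that writable=false sets the region
* to MEMPROT_READ alone, dropping any exec bit, so this pairing suits data
* pages; for code pages use make_writable()/make_unwritable() below, which
* preserve the existing exec flag:
*     if (change_protection(data_pc, PAGE_SIZE, true)) {
*         ...patch the data...
*         change_protection(data_pc, PAGE_SIZE, false);
*     }
*/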
/* make pc's page writable */
bool
make_writable(byte *pc, size_t size)
{
long res;
app_pc start_page = (app_pc) PAGE_START(pc);
size_t prot_size = (size == 0) ? PAGE_SIZE : size;
uint prot = PROT_EXEC|PROT_READ|PROT_WRITE;
/* if we can get the current protection then keep the old read/exec flags.
* this is crucial on modern linux kernels which refuse to mark stack +x.
*/
if (!is_in_dynamo_dll(pc)/*avoid allmem assert*/ &&
#ifdef STATIC_LIBRARY
/* FIXME i#975: is_in_dynamo_dll() is always false for STATIC_LIBRARY,
* but we can't call get_memory_info() until allmem is initialized. Our
* uses before then are for patching x86.asm, which is OK.
*/
IF_NO_MEMQUERY(memcache_initialized() &&)
#endif
get_memory_info(pc, NULL, NULL, &prot))
prot |= PROT_WRITE;
ASSERT(start_page == pc && ALIGN_FORWARD(size, PAGE_SIZE) == size);
#ifdef IA32_ON_IA64
LOG(THREAD_GET, LOG_VMAREAS, 1, "protection change not supported on IA64\n");
LOG(THREAD_GET, LOG_VMAREAS, 3,
"attempted make_writable: pc "PFX" -> "PFX"-"PFX"\n",
pc, start_page, start_page + prot_size);
#else
res = mprotect_syscall((void *) start_page, prot_size, prot);
LOG(THREAD_GET, LOG_VMAREAS, 3, "make_writable: pc "PFX" -> "PFX"-"PFX" %d\n",
pc, start_page, start_page + prot_size, res);
ASSERT(res == 0);
if (res != 0)
return false;
#endif
STATS_INC(protection_change_calls);
STATS_ADD(protection_change_pages, size / PAGE_SIZE);
#ifndef HAVE_MEMINFO_QUERY
/* update all_memory_areas list with the protection change */
if (memcache_initialized()) {
memcache_update_locked(start_page, start_page + prot_size,
osprot_to_memprot(prot),
-1/*type unchanged*/, true/*exists*/);
}
#endif
return true;
}
/* like make_writable but adds COW */
bool make_copy_on_writable(byte *pc, size_t size)
{
/* FIXME: for current usage this should be fine */
return make_writable(pc, size);
}
/* make pc's page unwritable */
void
make_unwritable(byte *pc, size_t size)
{
long res;
app_pc start_page = (app_pc) PAGE_START(pc);
size_t prot_size = (size == 0) ? PAGE_SIZE : size;
uint prot = PROT_EXEC|PROT_READ;
/* if we can get the current protection then keep the old read/exec flags.
* this is crucial on modern linux kernels which refuse to mark stack +x.
*/
if (!is_in_dynamo_dll(pc)/*avoid allmem assert*/ &&
#ifdef STATIC_LIBRARY
/* FIXME i#975: is_in_dynamo_dll() is always false for STATIC_LIBRARY,
* but we can't call get_memory_info() until allmem is initialized. Our
* uses before then are for patching x86.asm, which is OK.
*/
IF_NO_MEMQUERY(memcache_initialized() &&)
#endif
get_memory_info(pc, NULL, NULL, &prot))
prot &= ~PROT_WRITE;
ASSERT(start_page == pc && ALIGN_FORWARD(size, PAGE_SIZE) == size);
/* inc stats before making unwritable, in case messing w/ data segment */
STATS_INC(protection_change_calls);
STATS_ADD(protection_change_pages, size / PAGE_SIZE);
#ifdef IA32_ON_IA64
LOG(THREAD_GET, LOG_VMAREAS, 1, "protection change not supported on IA64\n");
LOG(THREAD_GET, LOG_VMAREAS, 3,
"attempted make_writable: pc "PFX" -> "PFX"-"PFX"\n",
pc, start_page, start_page + prot_size);
#else
res = mprotect_syscall((void *) start_page, prot_size, prot);
LOG(THREAD_GET, LOG_VMAREAS, 3, "make_unwritable: pc "PFX" -> "PFX"-"PFX"\n",
pc, start_page, start_page + prot_size);
ASSERT(res == 0);
# ifndef HAVE_MEMINFO_QUERY
/* update all_memory_areas list with the protection change */
if (memcache_initialized()) {
memcache_update_locked(start_page, start_page + prot_size,
osprot_to_memprot(prot), -1/*type unchanged*/,
false/*!exists*/);
}
# endif
#endif
}
/****************************************************************************/
/* SYSTEM CALLS */
/* SYS_ defines are in /usr/include/bits/syscall.h
* numbers used by libc are in /usr/include/asm/unistd.h
* kernel defines are in /usr/src/linux-2.4/include/asm-i386/unistd.h
* kernel function names are in /usr/src/linux/arch/i386/kernel/entry.S
*
* For now, we've copied the SYS/NR defines from syscall.h and unistd.h
* and put them in our own local syscall.h.
*/
/* num_raw should be the xax register value.
* For a live system call, dcontext_live should be passed (for examining
* the dcontext->last_exit and exit_reason flags); otherwise, gateway should
* be passed.
*/
int
os_normalized_sysnum(int num_raw, instr_t *gateway, dcontext_t *dcontext)
{
#ifdef MACOS
/* The x64 encoding indicates the syscall type in the top 8 bits.
* We drop the 0x2000000 for BSD so we can use the SYS_ enum constants.
* That leaves 0x1000000 for Mach and 0x3000000 for Machdep.
* On 32-bit, a different encoding is used: we transform that
* to the x64 encoding minus BSD.
*/
int interrupt = 0;
int num = 0;
if (gateway != NULL) {
if (instr_is_interrupt(gateway))
interrupt = instr_get_interrupt_number(gateway);
} else {
ASSERT(dcontext != NULL);
if (TEST(LINK_SPECIAL_EXIT, dcontext->last_exit->flags)) {
if (dcontext->upcontext.upcontext.exit_reason ==
EXIT_REASON_NI_SYSCALL_INT_0x81)
interrupt = 0x81;
else {
ASSERT(dcontext->upcontext.upcontext.exit_reason ==
EXIT_REASON_NI_SYSCALL_INT_0x82);
interrupt = 0x82;
}
}
}
# ifdef X64
if (num_raw >> 24 == 0x2)
return (int)(num_raw & 0xffffff); /* Drop BSD bit */
else
num = (int) num_raw; /* Keep Mach and Machdep bits */
# else
if ((ptr_int_t)num_raw < 0) /* Mach syscall */
return (SYSCALL_NUM_MARKER_MACH | -(int)num_raw);
else {
/* Bottom 16 bits are the number, top are arg size. */
num = (int)(num_raw & 0xffff);
}
# endif
if (interrupt == 0x81)
num |= SYSCALL_NUM_MARKER_MACH;
else if (interrupt == 0x82)
num |= SYSCALL_NUM_MARKER_MACHDEP;
return num;
#else
return num_raw;
#endif
}
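/* Worked examples for the MacOS normalization above:
* - x64 BSD:  raw 0x2000001 (SYS_exit) => 0x1, with the 0x2 type byte dropped;
* - x64 Mach: raw 0x100001f => returned as-is, keeping the Mach bits;
* - 32-bit Mach trap: raw -31 (mach_msg) => SYSCALL_NUM_MARKER_MACH | 31;
* - 32-bit int 0x81/0x82 gateways OR in the Mach/Machdep markers.
* On Linux the raw number is returned unchanged.
*/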
static bool
ignorable_system_call_normalized(int num)
{
switch (num) {
#if defined(SYS_exit_group)
case SYS_exit_group:
#endif
case SYS_exit:
#ifdef MACOS
case SYS_bsdthread_terminate:
#endif
#ifdef LINUX
case SYS_brk:
#endif
case SYS_mmap:
#if !defined(X64) && !defined(MACOS)
case SYS_mmap2:
#endif
case SYS_munmap:
#ifdef LINUX
case SYS_mremap:
#endif
case SYS_mprotect:
case SYS_execve:
#ifdef LINUX
case SYS_clone:
#elif defined(MACOS)
case SYS_bsdthread_create:
#endif
case SYS_fork:
case SYS_vfork:
case SYS_kill:
#if defined(SYS_tkill)
case SYS_tkill:
#endif
#if defined(SYS_tgkill)
case SYS_tgkill:
#endif
#if defined(LINUX) && !defined(X64)
case SYS_signal:
#endif
#if !defined(X64) || defined(MACOS)
case SYS_sigaction:
case SYS_sigsuspend:
case SYS_sigpending:
case SYS_sigreturn:
case SYS_sigprocmask:
#endif
#ifdef LINUX
case SYS_rt_sigreturn:
case SYS_rt_sigaction:
case SYS_rt_sigprocmask:
case SYS_rt_sigpending:
case SYS_rt_sigtimedwait:
case SYS_rt_sigqueueinfo:
case SYS_rt_sigsuspend:
case SYS_signalfd:
case SYS_signalfd4:
#endif
case SYS_sigaltstack:
#if defined(LINUX) && !defined(X64)
case SYS_sgetmask:
case SYS_ssetmask:
#endif
case SYS_setitimer:
case SYS_getitimer:
#ifdef MACOS
case SYS_close_nocancel:
#endif
case SYS_close:
case SYS_dup2:
#ifdef LINUX
case SYS_dup3:
#endif
#ifdef MACOS
case SYS_fcntl_nocancel:
#endif
case SYS_fcntl:
case SYS_getrlimit:
case SYS_setrlimit:
#ifdef LINUX
/* i#784: app may have behavior relying on SIGALRM */
case SYS_alarm:
#endif
/* i#107: syscall might change/query app's seg memory
* need stop app from clobbering our GDT slot.
*/
#if defined(LINUX) && defined(X64)
case SYS_arch_prctl:
#endif
#ifdef LINUX
case SYS_set_thread_area:
case SYS_get_thread_area:
/* FIXME: we might add SYS_modify_ldt later. */
#endif
return false;
default:
#ifdef VMX86_SERVER
if (is_vmkuw_sysnum(num))
return vmkuw_ignorable_system_call(num);
#endif
return true;
}
}
bool
ignorable_system_call(int num_raw, instr_t *gateway, dcontext_t *dcontext_live)
{
return ignorable_system_call_normalized
(os_normalized_sysnum(num_raw, gateway, dcontext_live));
}
typedef struct {
unsigned long addr;
unsigned long len;
unsigned long prot;
unsigned long flags;
unsigned long fd;
unsigned long offset;
} mmap_arg_struct_t;
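/* The legacy i386 SYS_mmap passes a pointer to this struct as its single
* argument (the kernel's old_mmap(struct mmap_arg_struct *)), while SYS_mmap2
* passes its six arguments in registers: see the SYS_mmap case in
* pre_system_call() below.
*/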
#endif /* !NOT_DYNAMORIO_CORE_PROPER: around most of file, to exclude preload */
const reg_id_t syscall_regparms[MAX_SYSCALL_ARGS] = {
#ifdef X86
# ifdef X64
DR_REG_RDI,
DR_REG_RSI,
DR_REG_RDX,
DR_REG_R10, /* RCX goes here in the normal x64 calling convention. */
DR_REG_R8,
DR_REG_R9
# else
DR_REG_EBX,
DR_REG_ECX,
DR_REG_EDX,
DR_REG_ESI,
DR_REG_EDI,
DR_REG_EBP
# endif /* 64/32-bit */
#elif defined(ARM)
# ifdef X64
# error AArch64 syscall not supported
# else
DR_REG_R0,
DR_REG_R1,
DR_REG_R2,
DR_REG_R3,
DR_REG_R4,
DR_REG_R5,
# endif /* 64/32-bit */
#endif /* X86/ARM */
};
#ifndef NOT_DYNAMORIO_CORE_PROPER
static inline reg_t *
sys_param_addr(dcontext_t *dcontext, int num)
{
/* we force-inline get_mcontext() and so don't take it as a param */
priv_mcontext_t *mc = get_mcontext(dcontext);
#ifdef X64
switch (num) {
case 0: return &mc->xdi;
case 1: return &mc->xsi;
case 2: return &mc->xdx;
case 3: return &mc->r10; /* since rcx holds retaddr for syscall instr */
case 4: return &mc->r8;
case 5: return &mc->r9;
default: CLIENT_ASSERT(false, "invalid system call parameter number");
}
#else
# ifdef MACOS
/* XXX: if we don't end up using dcontext->sys_was_int here, we could
* make that field Linux-only.
*/
/* For 32-bit, the args are passed on the stack, above a retaddr slot
* (regardless of whether using a sysenter or int gateway).
*/
return ((reg_t *)mc->esp) + 1/*retaddr*/ + num;
# endif
/* even for vsyscall where ecx (syscall) or esp (sysenter) are saved into
* ebp, the original parameter registers are not yet changed pre-syscall,
* except for ebp, which is pushed on the stack:
* 0xffffe400 55 push %ebp %esp -> %esp (%esp)
* 0xffffe401 89 cd mov %ecx -> %ebp
* 0xffffe403 0f 05 syscall -> %ecx
*
* 0xffffe400 51 push %ecx %esp -> %esp (%esp)
* 0xffffe401 52 push %edx %esp -> %esp (%esp)
* 0xffffe402 55 push %ebp %esp -> %esp (%esp)
* 0xffffe403 89 e5 mov %esp -> %ebp
* 0xffffe405 0f 34 sysenter -> %esp
*/
switch (num) {
case 0: return &mc->IF_X86_ELSE(xbx, r0);
case 1: return &mc->IF_X86_ELSE(xcx, r1);
case 2: return &mc->IF_X86_ELSE(xdx, r2);
case 3: return &mc->IF_X86_ELSE(xsi, r3);
case 4: return &mc->IF_X86_ELSE(xdi, r4);
/* FIXME: do a safe_read: but what about performance?
* See the #if 0 below, as well. */
case 5: return IF_X86_ELSE((dcontext->sys_was_int ? &mc->xbp : ((reg_t*)mc->xsp)),
&mc->r5);
default: CLIENT_ASSERT(false, "invalid system call parameter number");
}
#endif
return NULL;
}
static inline reg_t
sys_param(dcontext_t *dcontext, int num)
{
return *sys_param_addr(dcontext, num);
}
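/* E.g., for SYS_openat on Linux x64, sys_param(dcontext, 0) reads the dirfd
* from mc->xdi and sys_param(dcontext, 1) reads the pathname pointer from
* mc->xsi, matching syscall_regparms above.
*/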
static inline bool
syscall_successful(priv_mcontext_t *mc, int normalized_sysnum)
{
#ifdef MACOS
if (TEST(SYSCALL_NUM_MARKER_MACH, normalized_sysnum)) {
/* XXX: Mach syscalls vary (for some KERN_SUCCESS=0 is success,
* for others that return a mach_port_t, 0 is failure (I think?).
* We defer to drsyscall.
*/
return ((ptr_int_t)MCXT_SYSCALL_RES(mc) >= 0);
} else
return !TEST(EFLAGS_CF, mc->eflags);
#else
if (normalized_sysnum == SYS_mmap ||
# ifndef X64
normalized_sysnum == SYS_mmap2 ||
# endif
normalized_sysnum == SYS_mremap)
return mmap_syscall_succeeded((byte *)MCXT_SYSCALL_RES(mc));
return ((ptr_int_t)MCXT_SYSCALL_RES(mc) >= 0);
#endif
}
/* For non-Mac, this does nothing to indicate "success": you can pass -errno.
* For Mac, this clears CF and just sets xax. To return a 64-bit value in
* 32-bit mode, the caller must explicitly set xdx as well (we don't always
* do so b/c syscalls that just return 32-bit values do not touch xdx).
*/
static inline void
set_success_return_val(dcontext_t *dcontext, reg_t val)
{
/* since always coming from dispatch now, only need to set mcontext */
priv_mcontext_t *mc = get_mcontext(dcontext);
#ifdef MACOS
/* On MacOS, success is determined by CF, except for Mach syscalls, but
* there it doesn't hurt to set CF.
*/
mc->eflags &= ~(EFLAGS_CF);
#endif
MCXT_SYSCALL_RES(mc) = val;
}
/* Always pass a positive value for the error code.  (The parameter is named
* errno_val rather than errno to avoid clashing with libc's errno macro.)
*/
static inline void
set_failure_return_val(dcontext_t *dcontext, uint errno_val)
{
priv_mcontext_t *mc = get_mcontext(dcontext);
#ifdef MACOS
/* On MacOS, success is determined by CF, and the errno value is positive */
mc->eflags |= EFLAGS_CF;
MCXT_SYSCALL_RES(mc) = errno_val;
#else
MCXT_SYSCALL_RES(mc) = -(int)errno_val;
#endif
}
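/* Usage sketch (illustrative only): the typical pattern for squashing a
* syscall from pre_system_call(), as handle_close_pre() does below:
*     execute_syscall = false;
*     set_failure_return_val(dcontext, EBADF);
*     DODEBUG({ dcontext->expect_last_syscall_to_fail = true; });
*/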
#ifdef CLIENT_INTERFACE
DR_API
reg_t
dr_syscall_get_param(void *drcontext, int param_num)
{
dcontext_t *dcontext = (dcontext_t *) drcontext;
CLIENT_ASSERT(dcontext->client_data->in_pre_syscall,
"dr_syscall_get_param() can only be called from pre-syscall event");
return sys_param(dcontext, param_num);
}
DR_API
void
dr_syscall_set_param(void *drcontext, int param_num, reg_t new_value)
{
dcontext_t *dcontext = (dcontext_t *) drcontext;
CLIENT_ASSERT(dcontext->client_data->in_pre_syscall ||
dcontext->client_data->in_post_syscall,
"dr_syscall_set_param() can only be called from a syscall event");
*sys_param_addr(dcontext, param_num) = new_value;
}
DR_API
reg_t
dr_syscall_get_result(void *drcontext)
{
dcontext_t *dcontext = (dcontext_t *) drcontext;
CLIENT_ASSERT(dcontext->client_data->in_post_syscall,
"dr_syscall_get_param() can only be called from post-syscall event");
return MCXT_SYSCALL_RES(get_mcontext(dcontext));
}
DR_API
bool
dr_syscall_get_result_ex(void *drcontext, dr_syscall_result_info_t *info INOUT)
{
dcontext_t *dcontext = (dcontext_t *) drcontext;
priv_mcontext_t *mc = get_mcontext(dcontext);
CLIENT_ASSERT(dcontext->client_data->in_post_syscall,
"only call dr_syscall_get_param_ex() from post-syscall event");
CLIENT_ASSERT(info != NULL, "invalid parameter");
CLIENT_ASSERT(info->size == sizeof(*info), "invalid dr_syscall_result_info_t size");
if (info->size != sizeof(*info))
return false;
info->value = MCXT_SYSCALL_RES(mc);
info->succeeded = syscall_successful(mc, dcontext->sys_num);
if (info->use_high) {
/* MacOS has some 32-bit syscalls that return 64-bit values in
* xdx:xax, but the other syscalls don't clear xdx, so we can't easily
* return a 64-bit value all the time.
*/
IF_X86_ELSE({
info->high = mc->xdx;
}, {
ASSERT_NOT_REACHED();
});
}
if (info->use_errno) {
if (info->succeeded)
info->errno_value = 0;
else {
info->errno_value = (uint)IF_LINUX(-(int))MCXT_SYSCALL_RES(mc);
}
}
return true;
}
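/* Client usage sketch for the routine above (illustrative only), from a
* post-syscall event:
*     dr_syscall_result_info_t info = { sizeof(info), };
*     info.use_errno = true;
*     if (dr_syscall_get_result_ex(drcontext, &info) && !info.succeeded)
*         dr_printf("syscall failed, errno=%d\n", info.errno_value);
*/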
DR_API
void
dr_syscall_set_result(void *drcontext, reg_t value)
{
dcontext_t *dcontext = (dcontext_t *) drcontext;
CLIENT_ASSERT(dcontext->client_data->in_pre_syscall ||
dcontext->client_data->in_post_syscall,
"dr_syscall_set_result() can only be called from a syscall event");
/* For non-Mac, the caller can still pass -errno and this will work */
set_success_return_val(dcontext, value);
}
DR_API
bool
dr_syscall_set_result_ex(void *drcontext, dr_syscall_result_info_t *info)
{
dcontext_t *dcontext = (dcontext_t *) drcontext;
priv_mcontext_t *mc = get_mcontext(dcontext);
CLIENT_ASSERT(dcontext->client_data->in_pre_syscall ||
dcontext->client_data->in_post_syscall,
"dr_syscall_set_result() can only be called from a syscall event");
CLIENT_ASSERT(info->size == sizeof(*info), "invalid dr_syscall_result_info_t size");
if (info->size != sizeof(*info))
return false;
if (info->use_errno) {
if (info->succeeded) {
/* a weird case but we let the user combine these */
set_success_return_val(dcontext, info->errno_value);
} else
set_failure_return_val(dcontext, info->errno_value);
} else {
if (info->succeeded)
set_success_return_val(dcontext, info->value);
else {
/* use this to set CF, even though it might negate the value */
set_failure_return_val(dcontext, (uint)info->value);
/* now set the value, overriding set_failure_return_val() */
MCXT_SYSCALL_RES(mc) = info->value;
}
if (info->use_high) {
/* MacOS has some 32-bit syscalls that return 64-bit values in
* xdx:xax.
*/
IF_X86_ELSE({
mc->xdx = info->high;
}, {
ASSERT_NOT_REACHED();
});
}
}
return true;
}
DR_API
void
dr_syscall_set_sysnum(void *drcontext, int new_num)
{
dcontext_t *dcontext = (dcontext_t *) drcontext;
priv_mcontext_t *mc = get_mcontext(dcontext);
CLIENT_ASSERT(dcontext->client_data->in_pre_syscall ||
dcontext->client_data->in_post_syscall,
"dr_syscall_set_sysnum() can only be called from a syscall event");
MCXT_SYSNUM_REG(mc) = new_num;
}
DR_API
void
dr_syscall_invoke_another(void *drcontext)
{
dcontext_t *dcontext = (dcontext_t *) drcontext;
CLIENT_ASSERT(dcontext->client_data->in_post_syscall,
"dr_syscall_invoke_another() can only be called from post-syscall event");
LOG(THREAD, LOG_SYSCALLS, 2, "invoking additional syscall on client request\n");
dcontext->client_data->invoke_another_syscall = true;
# ifdef X86
if (get_syscall_method() == SYSCALL_METHOD_SYSENTER) {
priv_mcontext_t *mc = get_mcontext(dcontext);
/* restore xbp to xsp */
mc->xbp = mc->xsp;
}
# endif /* X86 */
/* for x64 we don't need to copy xcx into r10 b/c we use r10 as our param */
}
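/* Client usage sketch (illustrative only): chaining a second syscall from a
* post-syscall event, where extra_fd is a hypothetical descriptor the client
* wants closed once the app's own syscall has completed:
*     dr_syscall_set_sysnum(drcontext, SYS_close);
*     dr_syscall_set_param(drcontext, 0, (reg_t)extra_fd);
*     dr_syscall_invoke_another(drcontext);
*/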
#endif /* CLIENT_INTERFACE */
static inline bool
is_thread_create_syscall_helper(ptr_uint_t sysnum, ptr_uint_t flags)
{
#ifdef MACOS
/* XXX i#1403: we need earlier injection to intercept
* bsdthread_register in order to capture workqueue threads.
*/
return (sysnum == SYS_bsdthread_create || sysnum == SYS_vfork);
#else
return (sysnum == SYS_vfork
IF_LINUX(|| (sysnum == SYS_clone && TEST(CLONE_VM, flags))));
#endif
}
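/* E.g., glibc's pthread_create invokes SYS_clone with CLONE_VM (among other
* flags), so the helper above treats it as a thread create, while fork(), or
* a clone without CLONE_VM, creates a separate address space and is not
* treated as one.
*/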
bool
is_thread_create_syscall(dcontext_t *dcontext)
{
priv_mcontext_t *mc = get_mcontext(dcontext);
return is_thread_create_syscall_helper(MCXT_SYSNUM_REG(mc),
sys_param(dcontext, 0));
}
bool
was_thread_create_syscall(dcontext_t *dcontext)
{
return is_thread_create_syscall_helper(dcontext->sys_num,
/* flags in param0 */
dcontext->sys_param0);
}
static inline bool
is_sigreturn_syscall_helper(int sysnum)
{
#ifdef MACOS
return sysnum == SYS_sigreturn;
#else
return (IF_NOT_X64(sysnum == SYS_sigreturn ||) sysnum == SYS_rt_sigreturn);
#endif
}
bool
is_sigreturn_syscall(dcontext_t *dcontext)
{
priv_mcontext_t *mc = get_mcontext(dcontext);
return is_sigreturn_syscall_helper(MCXT_SYSNUM_REG(mc));
}
bool
was_sigreturn_syscall(dcontext_t *dcontext)
{
return is_sigreturn_syscall_helper(dcontext->sys_num);
}
/* process a signal this process/thread is sending to itself */
static void
handle_self_signal(dcontext_t *dcontext, uint sig)
{
/* FIXME PR 297903: watch for all DEFAULT_TERMINATE signals,
* and for any thread in the group, not just self.
*
* FIXME PR 297033: watch for SIGSTOP and SIGCONT.
*
* With -intercept_all_signals, we only need to watch for SIGKILL
* and SIGSTOP here, and we avoid the FIXMEs below. If it's fine
* for DR not to clean up on a SIGKILL, then SIGSTOP is all that's
* left (at least once we have PR 297033 and are intercepting the
* various STOP variations and CONT).
*/
if (sig == SIGABRT && !DYNAMO_OPTION(intercept_all_signals)) {
LOG(GLOBAL, LOG_TOP|LOG_SYSCALLS, 1,
"thread "TIDFMT" sending itself a SIGABRT\n", get_thread_id());
KSTOP(num_exits_dir_syscall);
/* FIXME: need to check whether app has a handler for SIGABRT! */
/* FIXME PR 211180/6723: this will do SYS_exit rather than the SIGABRT.
* Should do set_default_signal_action(SIGABRT) (and set a flag so
* no races w/ another thread re-installing?) and then SYS_kill.
*/
cleanup_and_terminate(dcontext, SYSNUM_EXIT_THREAD, -1, 0,
(is_last_app_thread() && !dynamo_exited),
IF_MACOS_ELSE(dcontext->thread_port, 0), 0);
ASSERT_NOT_REACHED();
}
}
/***************************************************************************
* EXECVE
*/
/* when adding here, also add to the switch in handle_execve if necessary */
enum {
ENV_PROP_RUNUNDER,
ENV_PROP_OPTIONS,
ENV_PROP_EXECVE_LOGDIR,
};
static const char * const env_to_propagate[] = {
/* these must line up with the enum */
DYNAMORIO_VAR_RUNUNDER,
DYNAMORIO_VAR_OPTIONS,
/* DYNAMORIO_VAR_EXECVE_LOGDIR is different from DYNAMORIO_VAR_LOGDIR:
* - DYNAMORIO_VAR_LOGDIR: a parent dir inside which a new dir will be created;
* - DYNAMORIO_VAR_EXECVE_LOGDIR: the exact subdir used by the pre-execve process.
* Xref comment in create_log_dir about their precedence.
*/
DYNAMORIO_VAR_EXECVE_LOGDIR,
/* these will only be propagated if they exist */
DYNAMORIO_VAR_CONFIGDIR,
};
#define NUM_ENV_TO_PROPAGATE (sizeof(env_to_propagate)/sizeof(env_to_propagate[0]))
/* called at pre-SYS_execve to append DR vars in the target process env vars list */
static void
add_dr_env_vars(dcontext_t *dcontext, char *inject_library_path)
{
char **envp = (char **) sys_param(dcontext, 2);
int idx, j, preload = -1, ldpath = -1;
int num_old, num_new, sz;
bool need_var[NUM_ENV_TO_PROPAGATE];
int prop_idx[NUM_ENV_TO_PROPAGATE];
bool ldpath_us = false, preload_us = false;
char **new_envp, *var, *old;
/* check if any var needs to be propagated */
for (j = 0; j < NUM_ENV_TO_PROPAGATE; j++) {
prop_idx[j] = -1;
if (get_config_val(env_to_propagate[j]) == NULL)
need_var[j] = false;
else
need_var[j] = true;
}
/* Special handling for DYNAMORIO_VAR_EXECVE_LOGDIR:
* we only need it if follow_children is true and PROCESS_DIR exists.
*/
if (DYNAMO_OPTION(follow_children) && get_log_dir(PROCESS_DIR, NULL, NULL))
need_var[ENV_PROP_EXECVE_LOGDIR] = true;
else
need_var[ENV_PROP_EXECVE_LOGDIR] = false;
/* iterate the env in target process */
if (envp == NULL) {
LOG(THREAD, LOG_SYSCALLS, 3, "\tenv is NULL\n");
idx = 0;
} else {
for (idx = 0; envp[idx] != NULL; idx++) {
/* execve env vars should never be set here */
ASSERT(strstr(envp[idx], DYNAMORIO_VAR_EXECVE) != envp[idx]);
for (j = 0; j < NUM_ENV_TO_PROPAGATE; j++) {
if (strstr(envp[idx], env_to_propagate[j]) == envp[idx]) {
/* If there is a conflict between the env and the cfg, we assume those
* env vars are for DR usage only, and replace them with the cfg value.
*/
prop_idx[j] = idx; /* remember the index for replacing later */
break;
}
}
if (strstr(envp[idx], "LD_LIBRARY_PATH=") == envp[idx]) {
ldpath = idx;
if (strstr(envp[idx], inject_library_path) != NULL)
ldpath_us = true;
}
if (strstr(envp[idx], "LD_PRELOAD=") == envp[idx]) {
preload = idx;
if (strstr(envp[idx], DYNAMORIO_PRELOAD_NAME) != NULL &&
strstr(envp[idx], DYNAMORIO_LIBRARY_NAME) != NULL) {
preload_us = true;
}
}
LOG(THREAD, LOG_SYSCALLS, 3, "\tenv %d: %s\n", idx, envp[idx]);
}
}
/* We want to add new env vars, so we create a new envp
* array. We have to deallocate them and restore the old
* envp if execve fails; if execve succeeds, the address
* space is reset so we don't need to do anything.
*/
num_old = idx;
/* how many new env vars we need to add */
num_new =
2 + /* execve indicator var plus final NULL */
((preload<0) ? 1 : 0) +
((ldpath<0) ? 1 : 0);
if (DYNAMO_OPTION(follow_children)) {
for (j = 0; j < NUM_ENV_TO_PROPAGATE; j++) {
if (need_var[j] && prop_idx[j] < 0)
num_new++;
}
}
/* setup new envp */
new_envp = heap_alloc(dcontext, sizeof(char*)*(num_old+num_new)
HEAPACCT(ACCT_OTHER));
/* copy old envp */
memcpy(new_envp, envp, sizeof(char*)*num_old);
/* change/add preload and ldpath if necessary */
if (!preload_us) {
int idx_preload;
LOG(THREAD, LOG_SYSCALLS, 1,
"WARNING: execve env does NOT preload DynamoRIO, forcing it!\n");
if (preload >= 0) {
/* replace the existing preload */
sz = strlen(envp[preload]) + strlen(DYNAMORIO_PRELOAD_NAME)+
strlen(DYNAMORIO_LIBRARY_NAME) + 3;
var = heap_alloc(dcontext, sizeof(char)*sz HEAPACCT(ACCT_OTHER));
old = envp[preload] + strlen("LD_PRELOAD=");
snprintf(var, sz, "LD_PRELOAD=%s %s %s",
DYNAMORIO_PRELOAD_NAME, DYNAMORIO_LIBRARY_NAME, old);
idx_preload = preload;
} else {
/* add new preload */
sz = strlen("LD_PRELOAD=") + strlen(DYNAMORIO_PRELOAD_NAME) +
strlen(DYNAMORIO_LIBRARY_NAME) + 2;
var = heap_alloc(dcontext, sizeof(char)*sz HEAPACCT(ACCT_OTHER));
snprintf(var, sz, "LD_PRELOAD=%s %s",
DYNAMORIO_PRELOAD_NAME, DYNAMORIO_LIBRARY_NAME);
idx_preload = idx++;
}
*(var+sz-1) = '\0'; /* null terminate */
new_envp[idx_preload] = var;
LOG(THREAD, LOG_SYSCALLS, 2, "\tnew env %d: %s\n",
idx_preload, new_envp[idx_preload]);
}
if (!ldpath_us) {
int idx_ldpath;
if (ldpath >= 0) {
sz = strlen(envp[ldpath]) + strlen(inject_library_path) + 2;
var = heap_alloc(dcontext, sizeof(char)*sz HEAPACCT(ACCT_OTHER));
old = envp[ldpath] + strlen("LD_LIBRARY_PATH=");
snprintf(var, sz, "LD_LIBRARY_PATH=%s:%s", inject_library_path, old);
idx_ldpath = ldpath;
} else {
sz = strlen("LD_LIBRARY_PATH=") + strlen(inject_library_path) + 1;
var = heap_alloc(dcontext, sizeof(char)*sz HEAPACCT(ACCT_OTHER));
snprintf(var, sz, "LD_LIBRARY_PATH=%s", inject_library_path);
idx_ldpath = idx++;
}
*(var+sz-1) = '\0'; /* null terminate */
new_envp[idx_ldpath] = var;
LOG(THREAD, LOG_SYSCALLS, 2, "\tnew env %d: %s\n",
idx_ldpath, new_envp[idx_ldpath]);
}
/* propagating DR env vars */
if (DYNAMO_OPTION(follow_children)) {
for (j = 0; j < NUM_ENV_TO_PROPAGATE; j++) {
const char *val = "";
if (!need_var[j])
continue;
switch (j) {
case ENV_PROP_RUNUNDER:
ASSERT(strcmp(env_to_propagate[j], DYNAMORIO_VAR_RUNUNDER) == 0);
/* Must pass RUNUNDER_ALL to get the child injected if it has no app config.
* If rununder var is already set we assume it's set to 1.
*/
ASSERT((RUNUNDER_ON | RUNUNDER_ALL) == 0x3); /* else, update "3" */
val = "3";
break;
case ENV_PROP_OPTIONS:
ASSERT(strcmp(env_to_propagate[j], DYNAMORIO_VAR_OPTIONS) == 0);
val = option_string;
break;
case ENV_PROP_EXECVE_LOGDIR:
/* we use PROCESS_DIR for DYNAMORIO_VAR_EXECVE_LOGDIR */
ASSERT(strcmp(env_to_propagate[j], DYNAMORIO_VAR_EXECVE_LOGDIR) == 0);
ASSERT(get_log_dir(PROCESS_DIR, NULL, NULL));
break;
default:
val = getenv(env_to_propagate[j]);
if (val == NULL)
val = "";
break;
}
if (j == ENV_PROP_EXECVE_LOGDIR) {
uint logdir_length;
get_log_dir(PROCESS_DIR, NULL, &logdir_length);
/* logdir_length includes the terminating NULL */
sz = strlen(DYNAMORIO_VAR_EXECVE_LOGDIR) + logdir_length + 1/* '=' */;
var = heap_alloc(dcontext, sizeof(char)*sz HEAPACCT(ACCT_OTHER));
snprintf(var, sz, "%s=", DYNAMORIO_VAR_EXECVE_LOGDIR);
get_log_dir(PROCESS_DIR, var+strlen(var), &logdir_length);
} else {
sz = strlen(env_to_propagate[j]) + strlen(val) + 2 /* '=' + null */;
var = heap_alloc(dcontext, sizeof(char)*sz HEAPACCT(ACCT_OTHER));
snprintf(var, sz, "%s=%s", env_to_propagate[j], val);
}
*(var+sz-1) = '\0'; /* null terminate */
prop_idx[j] = (prop_idx[j] >= 0) ? prop_idx[j] : idx++;
new_envp[prop_idx[j]] = var;
LOG(THREAD, LOG_SYSCALLS, 2, "\tnew env %d: %s\n",
prop_idx[j], new_envp[prop_idx[j]]);
}
} else {
if (prop_idx[ENV_PROP_RUNUNDER] >= 0) {
/* disable auto-following of this execve, yet still allow preload
* on other side to inject if config file exists.
* kind of hacky mangle here:
*/
ASSERT(!need_var[ENV_PROP_RUNUNDER]);
ASSERT(new_envp[prop_idx[ENV_PROP_RUNUNDER]][0] == 'D');
new_envp[prop_idx[ENV_PROP_RUNUNDER]][0] = 'X';
}
}
sz = strlen(DYNAMORIO_VAR_EXECVE) + 4;
/* we always pass this var to indicate "post-execve" */
var = heap_alloc(dcontext, sizeof(char)*sz HEAPACCT(ACCT_OTHER));
/* PR 458917: we overload this to also pass our gdt index */
ASSERT(os_tls_get_gdt_index(dcontext) < 100 &&
os_tls_get_gdt_index(dcontext) >= -1); /* only 2 chars allocated */
snprintf(var, sz, "%s=%02d", DYNAMORIO_VAR_EXECVE, os_tls_get_gdt_index(dcontext));
*(var+sz-1) = '\0'; /* null terminate */
new_envp[idx++] = var;
LOG(THREAD, LOG_SYSCALLS, 2, "\tnew env %d: %s\n", idx-1, new_envp[idx-1]);
/* must end with NULL */
new_envp[idx++] = NULL;
ASSERT((num_new + num_old) == idx);
/* update syscall param */
*sys_param_addr(dcontext, 2) = (reg_t) new_envp; /* OUT */
/* store for reset in case execve fails, and for cleanup if
* this is a vfork thread
*/
dcontext->sys_param0 = (reg_t) envp;
dcontext->sys_param1 = (reg_t) new_envp;
}
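/* E.g. (names and paths hypothetical), after this routine the child's envp
* might gain entries along the lines of:
*     LD_PRELOAD=libdrpreload.so libdynamorio.so
*     LD_LIBRARY_PATH=/path/to/dynamorio/lib32:<old value>
*     DYNAMORIO_RUNUNDER=3
* plus the DYNAMORIO_VAR_EXECVE indicator appended just before the final NULL.
*/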
static void
handle_execve(dcontext_t *dcontext)
{
/* in /usr/src/linux/arch/i386/kernel/process.c:
* asmlinkage int sys_execve(struct pt_regs regs) { ...
* error = do_execve(filename, (char **) regs.xcx, (char **) regs.xdx, &regs);
* in fs/exec.c:
* int do_execve(char * filename, char ** argv, char ** envp, struct pt_regs * regs)
*/
/* We need to make sure we get injected into the new image:
* we simply make sure LD_PRELOAD contains us, and that our directory
* is on LD_LIBRARY_PATH (seems not to work to put absolute paths in
* LD_PRELOAD).
* FIXME: this doesn't work for setuid programs
*
* For -follow_children we also pass the current DYNAMORIO_RUNUNDER and
* DYNAMORIO_OPTIONS and logdir to the new image to support a simple
* run-all-children model without bothering w/ setting up config files for
* children, and to support injecting across execve that does not
* preserve $HOME.
* FIXME i#287/PR 546544: we'll need to propagate DYNAMORIO_AUTOINJECT too
* once we use it in preload
*/
/* FIXME i#191: supposed to preserve things like pending signal
* set across execve: going to ignore for now
*/
char *fname = (char *) sys_param(dcontext, 0);
bool x64 = IF_X64_ELSE(true, false);
file_t file;
char *inject_library_path;
DEBUG_DECLARE(char **argv = (char **) sys_param(dcontext, 1);)
LOG(GLOBAL, LOG_ALL, 1, "\n---------------------------------------------------------------------------\n");
LOG(THREAD, LOG_ALL, 1, "\n---------------------------------------------------------------------------\n");
DODEBUG({
int i;
SYSLOG_INTERNAL_INFO("-- execve %s --", fname);
LOG(THREAD, LOG_SYSCALLS, 1, "syscall: execve %s\n", fname);
LOG(GLOBAL, LOG_TOP|LOG_SYSCALLS, 1, "execve %s\n", fname);
if (stats->loglevel >= 3) {
if (argv == NULL) {
LOG(THREAD, LOG_SYSCALLS, 3, "\targs are NULL\n");
} else {
for (i = 0; argv[i] != NULL; i++) {
LOG(THREAD, LOG_SYSCALLS, 2, "\targ %d: len=%d\n",
i, strlen(argv[i]));
LOG(THREAD, LOG_SYSCALLS, 3, "\targ %d: %s\n",
i, argv[i]);
}
}
}
});
/* i#237/PR 498284: if we're a vfork "thread" we're really in a different
* process and if we exec then the parent process will still be alive. We
* can't easily clean our own state (dcontext, dstack, etc.) up in our
* parent process: we need it to invoke the syscall and the syscall might
* fail. We could expand cleanup_and_terminate to also be able to invoke
* SYS_execve: but execve seems more likely to fail than termination
* syscalls. Our solution is to mark this thread as "execve" and hide it
* from regular thread queries; we clean it up in the process-exiting
* synch_with_thread(), or if the same parent thread performs another vfork
* (to prevent heap accumulation from repeated vfork+execve). Since vfork
* on linux suspends the parent, there cannot be any races with the execve
* syscall completing: there can't even be peer vfork threads, so we could
* set a flag and clean up in dispatch, but that seems overkill. (If vfork
* didn't suspend the parent we'd need to touch a marker file or something
* to know the execve was finished.)
*/
mark_thread_execve(dcontext->thread_record, true);
#ifdef STATIC_LIBRARY
/* no way we can inject, we just lose control */
SYSLOG_INTERNAL_WARNING("WARNING: static DynamoRIO library, losing control on execve");
return;
#endif
/* Issue 20: handle cross-architecture execve */
/* Xref alternate solution i#145: use dual paths on
* LD_LIBRARY_PATH to solve cross-arch execve
*/
file = os_open(fname, OS_OPEN_READ);
if (file != INVALID_FILE) {
x64 = module_file_is_module64(file);
os_close(file);
}
inject_library_path = IF_X64_ELSE(x64, !x64) ? dynamorio_library_path :
dynamorio_alt_arch_path;
add_dr_env_vars(dcontext, inject_library_path);
/* we need to clean up the .1config file here. if the execve fails,
* we'll just live w/o dynamic option re-read.
*/
config_exit();
}
static void
handle_execve_post(dcontext_t *dcontext)
{
/* if we get here it means execve failed (doesn't return on success),
* or we did an execve from a vfork and its memory changes are visible
* in the parent process.
* we have to restore env to how it was and free the allocated heap.
*/
char **old_envp = (char **) dcontext->sys_param0;
char **new_envp = (char **) dcontext->sys_param1;
#ifdef STATIC_LIBRARY
/* nothing to clean up */
return;
#endif
if (new_envp != NULL) {
int i;
LOG(THREAD, LOG_SYSCALLS, 2, "\tcleaning up our env vars\n");
/* we replaced existing ones and/or added new ones.
* we can't compare to old_envp b/c it may have changed by now.
*/
for (i=0; new_envp[i] != NULL; i++) {
if (is_dynamo_address((byte *)new_envp[i])) {
heap_free(dcontext, new_envp[i],
sizeof(char)*(strlen(new_envp[i])+1)
HEAPACCT(ACCT_OTHER));
}
}
i++; /* need to de-allocate final null slot too */
heap_free(dcontext, new_envp, sizeof(char*)*i HEAPACCT(ACCT_OTHER));
/* restore prev envp if we're post-syscall */
if (!dcontext->thread_record->execve)
*sys_param_addr(dcontext, 2) = (reg_t) old_envp;
}
}
/* i#237/PR 498284: to avoid accumulation of thread state we clean up a vfork
* child who invoked execve here so we have at most one outstanding thread. we
* also clean up at process exit and before thread creation. we could do this
* in dispatch but too rare to be worth a flag check there.
*/
static void
cleanup_after_vfork_execve(dcontext_t *dcontext)
{
thread_record_t **threads;
int num_threads, i;
if (num_execve_threads == 0)
return;
mutex_lock(&thread_initexit_lock);
get_list_of_threads_ex(&threads, &num_threads, true/*include execve*/);
for (i=0; i<num_threads; i++) {
if (threads[i]->execve) {
LOG(THREAD, LOG_SYSCALLS, 2, "cleaning up earlier vfork thread "TIDFMT"\n",
threads[i]->id);
dynamo_other_thread_exit(threads[i]);
}
}
mutex_unlock(&thread_initexit_lock);
global_heap_free(threads, num_threads*sizeof(thread_record_t*)
HEAPACCT(ACCT_THREAD_MGT));
}
/* returns whether to execute syscall */
static bool
handle_close_pre(dcontext_t *dcontext)
{
/* in fs/open.c: asmlinkage long sys_close(unsigned int fd) */
uint fd = (uint) sys_param(dcontext, 0);
LOG(THREAD, LOG_SYSCALLS, 3, "syscall: close fd %d\n", fd);
/* prevent app from closing our files */
if (fd_is_dr_owned(fd)) {
SYSLOG_INTERNAL_WARNING_ONCE("app trying to close DR file(s)");
LOG(THREAD, LOG_TOP|LOG_SYSCALLS, 1,
"WARNING: app trying to close DR file %d! Not allowing it.\n", fd);
set_failure_return_val(dcontext, EBADF);
DODEBUG({ dcontext->expect_last_syscall_to_fail = true; });
return false; /* do not execute syscall */
}
/* Xref PR 258731 - duplicate STDOUT/STDERR when app closes them so we (or
* a client) can continue to use them for logging. */
if (DYNAMO_OPTION(dup_stdout_on_close) && fd == STDOUT) {
our_stdout = fd_priv_dup(fd);
if (our_stdout < 0) /* no private fd available */
our_stdout = dup_syscall(fd);
if (our_stdout >= 0)
fd_mark_close_on_exec(our_stdout);
fd_table_add(our_stdout, 0);
LOG(THREAD, LOG_TOP|LOG_SYSCALLS, 1,
"WARNING: app is closing stdout=%d - duplicating descriptor for "
"DynamoRIO usage got %d.\n", fd, our_stdout);
if (privmod_stdout != NULL &&
IF_CLIENT_INTERFACE_ELSE(INTERNAL_OPTION(private_loader), false)) {
/* update the privately loaded libc's stdout _fileno. */
(*privmod_stdout)->STDFILE_FILENO = our_stdout;
}
}
if (DYNAMO_OPTION(dup_stderr_on_close) && fd == STDERR) {
our_stderr = fd_priv_dup(fd);
if (our_stderr < 0) /* no private fd available */
our_stderr = dup_syscall(fd);
if (our_stderr >= 0)
fd_mark_close_on_exec(our_stderr);
fd_table_add(our_stderr, 0);
LOG(THREAD, LOG_TOP|LOG_SYSCALLS, 1,
"WARNING: app is closing stderr=%d - duplicating descriptor for "
"DynamoRIO usage got %d.\n", fd, our_stderr);
if (privmod_stderr != NULL &&
IF_CLIENT_INTERFACE_ELSE(INTERNAL_OPTION(private_loader), false)) {
/* update the privately loaded libc's stderr _fileno. */
(*privmod_stderr)->STDFILE_FILENO = our_stderr;
}
}
if (DYNAMO_OPTION(dup_stdin_on_close) && fd == STDIN) {
our_stdin = fd_priv_dup(fd);
if (our_stdin < 0) /* no private fd available */
our_stdin = dup_syscall(fd);
if (our_stdin >= 0)
fd_mark_close_on_exec(our_stdin);
fd_table_add(our_stdin, 0);
LOG(THREAD, LOG_TOP|LOG_SYSCALLS, 1,
"WARNING: app is closing stdin=%d - duplicating descriptor for "
"DynamoRIO usage got %d.\n", fd, our_stdin);
if (privmod_stdin != NULL &&
IF_CLIENT_INTERFACE_ELSE(INTERNAL_OPTION(private_loader), false)) {
/* update the privately loaded libc's stdin _fileno. */
(*privmod_stdin)->STDFILE_FILENO = our_stdin;
}
}
return true;
}
/***************************************************************************/
/* Used to obtain the pc of the syscall instr itself when the dcontext dc
* is currently in a syscall handler.
* Alternatively for sysenter we could set app_sysenter_instr_addr for Linux.
*/
#define SYSCALL_PC(dc) \
((get_syscall_method() == SYSCALL_METHOD_INT || \
get_syscall_method() == SYSCALL_METHOD_SYSCALL) ? \
(ASSERT(SYSCALL_LENGTH == INT_LENGTH), \
POST_SYSCALL_PC(dc) - INT_LENGTH) : \
(vsyscall_syscall_end_pc - SYSENTER_LENGTH))
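/* E.g., the int and syscall instrs are both 2 bytes (the ASSERT above checks
* SYSCALL_LENGTH == INT_LENGTH), so for those gateways the pc is simply
* POST_SYSCALL_PC(dc) - 2; for sysenter the kernel resumes in the vsyscall
* page, so we instead use the recorded vsyscall_syscall_end_pc minus
* SYSENTER_LENGTH.
*/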
static void
handle_exit(dcontext_t *dcontext)
{
priv_mcontext_t *mc = get_mcontext(dcontext);
bool exit_process = false;
if (dcontext->sys_num == SYSNUM_EXIT_PROCESS) {
/* We can have multiple thread groups within the same address space.
* We need to know whether this is the only group left.
* FIXME: we can have races where new threads are created after our
* check: we'll live with that for now, but the right approach is to
* suspend all threads via synch_with_all_threads(), do the check,
* and if exit_process then exit w/o resuming: though have to
* coordinate lock access w/ cleanup_and_terminate.
* Xref i#94. Xref PR 541760.
*/
process_id_t mypid = get_process_id();
thread_record_t **threads;
int num_threads, i;
exit_process = true;
mutex_lock(&thread_initexit_lock);
get_list_of_threads(&threads, &num_threads);
for (i=0; i<num_threads; i++) {
if (threads[i]->pid != mypid && !IS_CLIENT_THREAD(threads[i]->dcontext)) {
exit_process = false;
break;
}
}
if (!exit_process) {
/* We need to clean up the other threads in our group here. */
thread_id_t myid = get_thread_id();
priv_mcontext_t mcontext;
DEBUG_DECLARE(thread_synch_result_t synch_res;)
LOG(THREAD, LOG_TOP|LOG_SYSCALLS, 1,
"SYS_exit_group %d not final group: %d cleaning up just "
"threads in group\n", get_process_id(), get_thread_id());
/* Set where we are to handle reciprocal syncs */
copy_mcontext(mc, &mcontext);
mc->pc = SYSCALL_PC(dcontext);
for (i=0; i<num_threads; i++) {
if (threads[i]->id != myid && threads[i]->pid == mypid) {
/* See comments in dynamo_process_exit_cleanup(): we terminate
* to make cleanup easier, but may want to switch to shifting
* the target thread to a stack-free loop.
*/
DEBUG_DECLARE(synch_res =)
synch_with_thread(threads[i]->id, true/*block*/,
true/*have initexit lock*/,
THREAD_SYNCH_VALID_MCONTEXT,
THREAD_SYNCH_TERMINATED_AND_CLEANED,
THREAD_SYNCH_SUSPEND_FAILURE_IGNORE);
/* initexit lock may be released and re-acquired in course of
* doing the synch so we may have races where the thread
* exits on its own (or new threads appear): we'll live
* with those for now.
*/
ASSERT(synch_res == THREAD_SYNCH_RESULT_SUCCESS);
}
}
copy_mcontext(&mcontext, mc);
}
mutex_unlock(&thread_initexit_lock);
global_heap_free(threads, num_threads*sizeof(thread_record_t*)
HEAPACCT(ACCT_THREAD_MGT));
}
if (is_last_app_thread() && !dynamo_exited) {
LOG(THREAD, LOG_TOP|LOG_SYSCALLS, 1,
"SYS_exit%s(%d) in final thread "TIDFMT" of "PIDFMT" => exiting DynamoRIO\n",
(dcontext->sys_num == SYSNUM_EXIT_PROCESS) ? "_group" : "",
MCXT_SYSNUM_REG(mc),
get_thread_id(), get_process_id());
/* we want to clean up even if not automatic startup! */
automatic_startup = true;
exit_process = true;
} else {
LOG(THREAD, LOG_TOP|LOG_THREADS|LOG_SYSCALLS, 1,
"SYS_exit%s(%d) in thread "TIDFMT" of "PIDFMT" => cleaning up %s\n",
(dcontext->sys_num == SYSNUM_EXIT_PROCESS) ? "_group" : "",
MCXT_SYSNUM_REG(mc), get_thread_id(), get_process_id(),
exit_process ? "process" : "thread");
}
KSTOP(num_exits_dir_syscall);
cleanup_and_terminate(dcontext, MCXT_SYSNUM_REG(mc), sys_param(dcontext, 0),
sys_param(dcontext, 1), exit_process,
/* SYS_bsdthread_terminate has 2 more args */
sys_param(dcontext, 2), sys_param(dcontext, 3));
}
#ifdef LINUX /* XXX i#58: just until we have Mac support */
static bool
os_set_app_thread_area(dcontext_t *dcontext, our_modify_ldt_t *user_desc)
{
#ifdef X86
int i;
os_thread_data_t *ostd = dcontext->os_field;
our_modify_ldt_t *desc = (our_modify_ldt_t *)ostd->app_thread_areas;
if (user_desc->seg_not_present == 1) {
/* find an empty one to update */
for (i = 0; i < GDT_NUM_TLS_SLOTS; i++) {
if (desc[i].seg_not_present == 1)
break;
}
if (i < GDT_NUM_TLS_SLOTS) {
user_desc->entry_number = GDT_SELECTOR(i + tls_min_index());
memcpy(&desc[i], user_desc, sizeof(*user_desc));
} else
return false;
} else {
/* If we used early injection, this might be ld.so trying to set up TLS. We
* direct the app to use the GDT entry we already set up for our private
* libraries, but only the first time it requests TLS.
*/
if (user_desc->entry_number == -1 && return_stolen_lib_tls_gdt) {
mutex_lock(&set_thread_area_lock);
if (return_stolen_lib_tls_gdt) {
uint selector = read_selector(LIB_SEG_TLS);
uint index = SELECTOR_INDEX(selector);
SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT);
return_stolen_lib_tls_gdt = false;
SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT);
user_desc->entry_number = index;
LOG(GLOBAL, LOG_THREADS, 2, "%s: directing app to use "
"selector 0x%x for first call to set_thread_area\n",
__FUNCTION__, selector);
}
mutex_unlock(&set_thread_area_lock);
}
/* update the specific one */
i = user_desc->entry_number - tls_min_index();
if (i < 0 || i >= GDT_NUM_TLS_SLOTS)
return false;
LOG(GLOBAL, LOG_THREADS, 2,
"%s: change selector 0x%x base from "PFX" to "PFX"\n",
__FUNCTION__, GDT_SELECTOR(user_desc->entry_number),
desc[i].base_addr, user_desc->base_addr);
memcpy(&desc[i], user_desc, sizeof(*user_desc));
}
/* if it does not conflict with DR's TLS, perform the syscall */
if (IF_CLIENT_INTERFACE_ELSE(!INTERNAL_OPTION(private_loader), true) &&
GDT_SELECTOR(user_desc->entry_number) != read_selector(SEG_TLS) &&
GDT_SELECTOR(user_desc->entry_number) != read_selector(LIB_SEG_TLS))
return false;
#elif defined(ARM)
/* FIXME i#1551: NYI on ARM */
ASSERT_NOT_IMPLEMENTED(false);
#endif /* X86/ARM */
return true;
}
static bool
os_get_app_thread_area(dcontext_t *dcontext, our_modify_ldt_t *user_desc)
{
#ifdef X86
os_thread_data_t *ostd = (os_thread_data_t *)dcontext->os_field;
our_modify_ldt_t *desc = (our_modify_ldt_t *)ostd->app_thread_areas;
int i = user_desc->entry_number - tls_min_index();
if (i < 0 || i >= GDT_NUM_TLS_SLOTS)
return false;
if (desc[i].seg_not_present == 1)
return false;
#elif defined(ARM)
/* FIXME i#1551: NYI on ARM */
ASSERT_NOT_IMPLEMENTED(false);
#endif /* X86/ARM */
return true;
}
#endif
/* This function is used to switch the lib TLS segment when creating a thread.
* We switch to the app's lib TLS seg before a thread-creation system call,
* i.e., clone or vfork, and switch back to DR's lib TLS seg after the system
* call. These are only called in the parent thread, not the child thread.
* The child thread's TLS is set up in os_tls_app_seg_init.
*/
/* XXX: It looks like the Linux kernel has some dependency on the segment
* descriptor. If we use DR's segment descriptor, the created thread gets an
* access violation because its TLS is not yet set up. However, it works fine
* if we switch the descriptor to the app's segment descriptor before creating
* the thread. We should be able to remove this function later if we find the
* underlying problem.
*/
static bool
os_switch_lib_tls(dcontext_t *dcontext, bool to_app)
{
return os_switch_seg_to_context(dcontext, LIB_SEG_TLS, to_app);
}
static bool
os_switch_seg_to_context(dcontext_t *dcontext, reg_id_t seg, bool to_app)
{
bool res = false;
#ifdef X86
app_pc base;
os_local_state_t *os_tls = get_os_tls_from_dc(dcontext);
/* we can only update the executing thread's segment (i#920) */
ASSERT_MESSAGE(CHKLVL_ASSERTS+1/*expensive*/, "can only act on executing thread",
dcontext == get_thread_private_dcontext());
ASSERT(IF_X86_ELSE((seg == SEG_FS || seg == SEG_GS),
(seg == DR_REG_TPIDRURW || seg == DR_REG_TPIDRURO)));
if (to_app) {
base = os_get_app_seg_base(dcontext, seg);
} else {
base = os_get_dr_seg_base(dcontext, seg);
}
switch (os_tls->tls_type) {
# ifdef X64
case TLS_TYPE_ARCH_PRCTL: {
res = tls_set_fs_gs_segment_base(os_tls->tls_type, seg, base, NULL);
ASSERT(res);
LOG(GLOBAL, LOG_THREADS, 2,
"%s %s: arch_prctl successful for thread "TIDFMT" base "PFX"\n",
__FUNCTION__, to_app ? "to app" : "to DR", get_thread_id(), base);
if (seg == SEG_TLS && base == NULL) {
/* Set the selector to 0 so we don't think TLS is available. */
/* FIXME i#107: Still assumes app isn't using SEG_TLS. */
reg_t zero = 0;
WRITE_DR_SEG(zero);
}
break;
}
# endif
case TLS_TYPE_GDT: {
our_modify_ldt_t desc;
uint index;
uint selector;
if (to_app) {
selector = (seg == SEG_FS ? os_tls->app_fs : os_tls->app_gs);
index = SELECTOR_INDEX(selector);
} else {
index = (seg == LIB_SEG_TLS ? tls_priv_lib_index() : tls_dr_index());
ASSERT(index != -1 && "TLS indices not initialized");
selector = GDT_SELECTOR(index);
}
if (selector != 0) {
if (to_app) {
our_modify_ldt_t *areas =
((os_thread_data_t *)dcontext->os_field)->app_thread_areas;
ASSERT((index >= tls_min_index()) &&
((index - tls_min_index()) < GDT_NUM_TLS_SLOTS));
desc = areas[index - tls_min_index()];
} else {
tls_init_descriptor(&desc, base, GDT_NO_SIZE_LIMIT, index);
}
res = tls_set_fs_gs_segment_base(os_tls->tls_type, seg, NULL, &desc);
ASSERT(res);
} else {
/* For a selector of zero, we just reset the segment to zero. We
* don't need to call set_thread_area.
*/
res = true; /* Indicate success. */
}
/* i#558: update the lib seg reg to enforce the segment changes */
LOG(THREAD, LOG_LOADER, 2, "%s: switching to %s, setting %s to 0x%x\n",
__FUNCTION__, (to_app ? "app" : "dr"), reg_names[seg], selector);
WRITE_LIB_SEG(selector);
LOG(THREAD, LOG_LOADER, 2,
"%s %s: set_thread_area successful for thread "TIDFMT" base "PFX"\n",
__FUNCTION__, to_app ? "to app" : "to DR", get_thread_id(), base);
break;
}
case TLS_TYPE_LDT: {
uint index;
uint selector;
/* XXX i#1285: added for MacOS private loader, but we don't
* have enough other code to test this yet.
*/
ASSERT_NOT_TESTED();
if (to_app) {
selector = (seg == SEG_FS ? os_tls->app_fs : os_tls->app_gs);
index = SELECTOR_INDEX(selector);
} else {
index = (seg == LIB_SEG_TLS ? tls_priv_lib_index() : tls_dr_index());
ASSERT(index != -1 && "TLS indices not initialized");
selector = LDT_SELECTOR(index);
}
LOG(THREAD, LOG_LOADER, 2, "%s: switching to %s, setting %s to 0x%x\n",
__FUNCTION__, (to_app ? "app" : "dr"), reg_names[seg], selector);
WRITE_LIB_SEG(selector);
LOG(THREAD, LOG_LOADER, 2,
"%s %s: ldt selector swap successful for thread "TIDFMT"\n",
__FUNCTION__, to_app ? "to app" : "to DR", get_thread_id());
break;
}
default:
ASSERT_NOT_REACHED();
return false;
}
ASSERT(BOOLS_MATCH(to_app, os_using_app_state(dcontext)));
#elif defined(ARM)
/* FIXME i#1551: NYI on ARM */
ASSERT_NOT_IMPLEMENTED(false);
#endif /* X86/ARM */
return res;
}
/* System call interception: put any special handling here
* Arguments come from the pusha right before the call
*/
/* WARNING: flush_fragments_and_remove_region assumes that pre and post system
* call handlers do not examine or modify fcache or its fragments in any
* way except for calling flush_fragments_and_remove_region!
*/
/* WARNING: All registers are IN values, but NOT OUT values --
* must set mcontext's register for that.
*/
/* Returns false if system call should NOT be executed
* Returns true if system call should go ahead
*/
/* FIXME: split out specific handlers into separate routines
*/
bool
pre_system_call(dcontext_t *dcontext)
{
priv_mcontext_t *mc = get_mcontext(dcontext);
bool execute_syscall = true;
where_am_i_t old_whereami = dcontext->whereami;
dcontext->whereami = WHERE_SYSCALL_HANDLER;
/* FIXME We haven't yet done the work to detect which syscalls we
* can determine a priori will fail. Once we do, we will set the
* expect_last_syscall_to_fail to true for those cases, and can
* confirm in post_system_call() that the syscall failed as
* expected.
*/
DODEBUG(dcontext->expect_last_syscall_to_fail = false;);
/* save key register values for post_system_call (they get clobbered
* in syscall itself)
*/
dcontext->sys_num = os_normalized_sysnum((int)MCXT_SYSNUM_REG(mc), NULL, dcontext);
RSTATS_INC(pre_syscall);
DOSTATS({
if (ignorable_system_call_normalized(dcontext->sys_num))
STATS_INC(pre_syscall_ignorable);
});
LOG(THREAD, LOG_SYSCALLS, 2, "system call %d\n", dcontext->sys_num);
#if defined(LINUX) && defined(X86)
/* PR 313715: If we fail to hook the vsyscall page (xref PR 212570, PR 288330)
* we fall back on int, but we have to tweak syscall param #5 (ebp)
* Once we have PR 288330 we can remove this.
*/
if (should_syscall_method_be_sysenter() && !dcontext->sys_was_int) {
dcontext->sys_xbp = mc->xbp;
/* not using SAFE_READ due to performance concerns (we do this for
* every single system call on systems where we can't hook vsyscall!)
*/
TRY_EXCEPT(dcontext, /* try */ {
mc->xbp = *(reg_t*)mc->xsp;
}, /* except */ {
ASSERT_NOT_REACHED();
mc->xbp = 0;
});
}
#endif
switch (dcontext->sys_num) {
case SYSNUM_EXIT_PROCESS:
#if defined(LINUX) && defined(VMX86_SERVER)
if (os_in_vmkernel_32bit()) {
/* on esx 3.5 => ENOSYS, so wait for SYS_exit */
LOG(THREAD, LOG_SYSCALLS, 2, "on esx35 => ignoring exitgroup\n");
DODEBUG({ dcontext->expect_last_syscall_to_fail = true; });
break;
}
#endif
/* fall-through */
case SYSNUM_EXIT_THREAD: {
handle_exit(dcontext);
break;
}
/****************************************************************************/
/* MEMORY REGIONS */
#if defined(LINUX) && !defined(X64)
case SYS_mmap: {
/* in /usr/src/linux/arch/i386/kernel/sys_i386.c:
asmlinkage int old_mmap(struct mmap_arg_struct_t *arg)
*/
mmap_arg_struct_t *arg = (mmap_arg_struct_t *) sys_param(dcontext, 0);
mmap_arg_struct_t arg_buf;
if (safe_read(arg, sizeof(mmap_arg_struct_t), &arg_buf)) {
void *addr = (void *) arg->addr;
size_t len = (size_t) arg->len;
uint prot = (uint) arg->prot;
LOG(THREAD, LOG_SYSCALLS, 2,
"syscall: mmap addr="PFX" size="PIFX" prot=0x%x"
" flags="PIFX" offset="PIFX" fd=%d\n",
addr, len, prot, arg->flags, arg->offset, arg->fd);
/* Check for overlap with existing code or patch-proof regions */
if (addr != NULL &&
!app_memory_pre_alloc(dcontext, addr, len, osprot_to_memprot(prot),
!TEST(MAP_FIXED, arg->flags))) {
/* Rather than failing or skipping the syscall we'd like to just
* remove the hint -- but we don't want to write to app memory, so
* we do fail. We could set up our own mmap_arg_struct_t but
* we'd need dedicated per-thread storage, and SYS_mmap is obsolete.
*/
execute_syscall = false;
set_failure_return_val(dcontext, ENOMEM);
DODEBUG({ dcontext->expect_last_syscall_to_fail = true; });
break;
}
}
/* post_system_call does the work */
dcontext->sys_param0 = (reg_t) arg;
break;
}
#endif
case IF_MACOS_ELSE(SYS_mmap,IF_X64_ELSE(SYS_mmap,SYS_mmap2)): {
/* in /usr/src/linux/arch/i386/kernel/sys_i386.c:
asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff)
*/
void *addr = (void *) sys_param(dcontext, 0);
size_t len = (size_t) sys_param(dcontext, 1);
uint prot = (uint) sys_param(dcontext, 2);
uint flags = (uint) sys_param(dcontext, 3);
LOG(THREAD, LOG_SYSCALLS, 2,
"syscall: mmap2 addr="PFX" size="PIFX" prot=0x%x"
" flags="PIFX" offset="PIFX" fd=%d\n",
addr, len, prot, flags,
sys_param(dcontext, 5), sys_param(dcontext, 4));
/* Check for overlap with existing code or patch-proof regions */
if (addr != NULL &&
!app_memory_pre_alloc(dcontext, addr, len, osprot_to_memprot(prot),
!TEST(MAP_FIXED, flags))) {
if (!TEST(MAP_FIXED, flags)) {
/* Rather than failing or skipping the syscall we just remove
* the hint which should eliminate any overlap.
*/
*sys_param_addr(dcontext, 0) = 0;
} else {
execute_syscall = false;
set_failure_return_val(dcontext, ENOMEM);
DODEBUG({ dcontext->expect_last_syscall_to_fail = true; });
break;
}
}
/* post_system_call does the work */
dcontext->sys_param0 = (reg_t) addr;
dcontext->sys_param1 = len;
dcontext->sys_param2 = prot;
dcontext->sys_param3 = flags;
break;
}
/* must flush stale fragments when we see munmap/mremap */
case SYS_munmap: {
/* in /usr/src/linux/mm/mmap.c:
asmlinkage long sys_munmap(unsigned long addr, uint len)
*/
app_pc addr = (void *) sys_param(dcontext, 0);
size_t len = (size_t) sys_param(dcontext, 1);
LOG(THREAD, LOG_SYSCALLS, 2, "syscall: munmap addr="PFX" size="PFX"\n",
addr, len);
RSTATS_INC(num_app_munmaps);
/* FIXME addr is supposed to be on a page boundary so we
* could detect that condition here and set
* expect_last_syscall_to_fail.
*/
/* save params in case an undo is needed in post_system_call */
dcontext->sys_param0 = (reg_t) addr;
dcontext->sys_param1 = len;
/* We assume that the unmap will succeed and so are conservative
* and remove the region from exec areas and flush all fragments
* prior to issuing the syscall. If the unmap fails, we try to
* recover in post_system_call() by re-adding the region. This
* approach has its shortcomings -- see comments below in
* post_system_call().
*/
/* Check for unmapping a module. */
os_get_module_info_lock();
if (module_overlaps(addr, len)) {
/* FIXME - handle unmapping more than one module at once, or only unmapping
* part of a module (in which case should we adjust the view size or treat
* it as a full unmap?). Theoretical for now as we haven't seen this. */
module_area_t *ma = module_pc_lookup(addr);
ASSERT_CURIOSITY(ma != NULL);
ASSERT_CURIOSITY(addr == ma->start);
/* XREF 307599 on rounding module end to the next PAGE boundary */
ASSERT_CURIOSITY((app_pc)ALIGN_FORWARD(addr+len, PAGE_SIZE) == ma->end);
os_get_module_info_unlock();
/* i#210:
* we only think a module is removed if its first memory region
* is unloaded (unmapped).
* XREF i#160 to fix the real problem of handling module splitting.
*/
if (ma != NULL && ma->start == addr)
module_list_remove(addr, ALIGN_FORWARD(len, PAGE_SIZE));
} else
os_get_module_info_unlock();
app_memory_deallocation(dcontext, (app_pc)addr, len,
false /* don't own thread_initexit_lock */,
true /* image, FIXME: though not necessarily */);
/* FIXME: case 4983 use is_elf_so_header() */
#ifndef HAVE_MEMINFO_QUERY
memcache_lock();
memcache_remove(addr, addr + len);
memcache_unlock();
#endif
break;
}
#ifdef LINUX
case SYS_mremap: {
/* in /usr/src/linux/mm/mmap.c:
asmlinkage unsigned long sys_mremap(unsigned long addr,
unsigned long old_len, unsigned long new_len,
unsigned long flags, unsigned long new_addr)
*/
dr_mem_info_t info;
app_pc addr = (void *) sys_param(dcontext, 0);
size_t old_len = (size_t) sys_param(dcontext, 1);
size_t new_len = (size_t) sys_param(dcontext, 2);
DEBUG_DECLARE(bool ok;)
LOG(THREAD, LOG_SYSCALLS, 2, "syscall: mremap addr="PFX" size="PFX"\n",
addr, old_len);
/* post_system_call does the work */
dcontext->sys_param0 = (reg_t) addr;
dcontext->sys_param1 = old_len;
dcontext->sys_param2 = new_len;
/* i#173: we need the memory type and prot to set up the
* new memory region in post_system_call
*/
DEBUG_DECLARE(ok =)
query_memory_ex(addr, &info);
ASSERT(ok);
dcontext->sys_param3 = info.prot;
dcontext->sys_param4 = info.type;
DOCHECK(1, {
/* we don't expect to see remappings of modules */
os_get_module_info_lock();
ASSERT_CURIOSITY(!module_overlaps(addr, old_len));
os_get_module_info_unlock();
});
break;
}
#endif
case SYS_mprotect: {
/* in /usr/src/linux/mm/mprotect.c:
asmlinkage long sys_mprotect(unsigned long start, uint len,
unsigned long prot)
*/
uint res;
DEBUG_DECLARE(size_t size;)
app_pc addr = (void *) sys_param(dcontext, 0);
size_t len = (size_t) sys_param(dcontext, 1);
uint prot = (uint) sys_param(dcontext, 2);
uint new_memprot;
/* save params in case an undo is needed in post_system_call */
dcontext->sys_param0 = (reg_t) addr;
dcontext->sys_param1 = len;
dcontext->sys_param2 = prot;
LOG(THREAD, LOG_SYSCALLS, 2,
"syscall: mprotect addr="PFX" size="PFX" prot=%s\n",
addr, len, memprot_string(osprot_to_memprot(prot)));
/* PR 413109 - fail mprotect if start region is unknown; seen in hostd.
* FIXME: get_memory_info_from_os() should be used instead of
* vmvector_lookup_data() to catch mprotect failure cases on shared
* memory allocated by another process. However, till PROC_MAPS
* are implemented on visor, get_memory_info_from_os() can't
* distinguish between inaccessible and unallocated, so it doesn't
* work. Once PROC_MAPS is available on visor use
* get_memory_info_from_os() and resolve case.
*
* FIXME: Failing mprotect if addr isn't allocated doesn't help if there
* are unallocated pages in the middle of the mprotect region.
* As it will be expensive to do page wise check for each mprotect
* syscall just to guard against a corner case, it might be better
* to let the system call fail and recover in post_system_call().
* See PR 410921.
*/
if (!get_memory_info(addr, NULL, IF_DEBUG_ELSE(&size, NULL), NULL)) {
LOG(THREAD, LOG_SYSCALLS, 2,
"\t"PFX" isn't mapped; aborting mprotect\n", addr);
execute_syscall = false;
set_failure_return_val(dcontext, ENOMEM);
DODEBUG({ dcontext->expect_last_syscall_to_fail = true; });
break;
} else {
/* If mprotect region spans beyond the end of the vmarea then it
* spans 2 or more vmareas with dissimilar protection (xref
* PR 410921) or has unallocated regions in between (PR 413109).
*/
DOCHECK(1, dcontext->mprot_multi_areas = len > size ? true : false;);
}
res = app_memory_protection_change(dcontext, addr, len,
osprot_to_memprot(prot),
&new_memprot, NULL);
if (res != DO_APP_MEM_PROT_CHANGE) {
if (res == FAIL_APP_MEM_PROT_CHANGE) {
ASSERT_NOT_IMPLEMENTED(false); /* return code? */
} else {
ASSERT_NOT_IMPLEMENTED(res != SUBSET_APP_MEM_PROT_CHANGE);
ASSERT_NOT_REACHED();
}
execute_syscall = false;
}
else {
/* FIXME Store state for undo if the syscall fails. */
IF_NO_MEMQUERY(memcache_update_locked(addr, addr + len,
osprot_to_memprot(prot),
-1/*type unchanged*/, true/*exists*/));
}
break;
}
#ifdef LINUX
case SYS_brk: {
/* i#91/PR 396352: need to watch SYS_brk to maintain all_memory_areas.
* We store the old break in the param1 slot.
*/
DODEBUG(dcontext->sys_param0 = (reg_t) sys_param(dcontext, 0););
dcontext->sys_param1 = dynamorio_syscall(SYS_brk, 1, 0);
break;
}
case SYS_uselib: {
/* Used to get the kernel to load a shared library (legacy system call).
* Was primarily used when statically linking to dynamically loaded shared
* libraries that were loaded at known locations. Shouldn't be used by
* applications that use the dynamic loader (ld), which is currently the
* only way we can inject, so we don't expect to see this. PR 307621. */
ASSERT_NOT_IMPLEMENTED(false);
break;
}
#endif
/****************************************************************************/
/* SPAWNING */
#ifdef LINUX
case SYS_clone: {
/* in /usr/src/linux/arch/i386/kernel/process.c
* 32-bit params: flags, newsp, ptid, tls, ctid
* 64-bit params: should be the same yet tls (for ARCH_SET_FS) is in r8?!?
* I don't see how sys_clone gets its special args: shouldn't it
* just get pt_regs as a "special system call"?
* sys_clone(unsigned long clone_flags, unsigned long newsp,
* void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
*/
uint flags = (uint) sys_param(dcontext, 0);
LOG(THREAD, LOG_SYSCALLS, 2, "syscall: clone with flags = "PFX"\n", flags);
LOG(THREAD, LOG_SYSCALLS, 2, "args: "PFX", "PFX", "PFX", "PFX", "PFX"\n",
sys_param(dcontext, 0), sys_param(dcontext, 1), sys_param(dcontext, 2),
sys_param(dcontext, 3), sys_param(dcontext, 4));
handle_clone(dcontext, flags);
if ((flags & CLONE_VM) == 0) {
LOG(THREAD, LOG_SYSCALLS, 1, "\tWARNING: CLONE_VM not set!\n");
}
/* save for post_system_call */
dcontext->sys_param0 = (reg_t) flags;
/* i#1010: If we have private fds open (usually logfiles), we should
* clean those up before they get reused by a new thread.
* XXX: Ideally we'd do this in fd_table_add(), but we can't acquire
* thread_initexit_lock there.
*/
cleanup_after_vfork_execve(dcontext);
/* For thread creation clone syscalls a clone_record_t structure
* containing the pc after the app's syscall instr and other data
* (see i#27) is placed at the bottom of the dstack (which is allocated
* by create_clone_record() - it also saves app stack and switches
* to dstack). xref i#149/PR 403015.
* Note: This must be done after sys_param0 is set.
*/
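/* One possible picture (illustrative only; create_clone_record() is
* authoritative about the layout):
*   dstack base (high addr) -> +------------------+
*                              | clone_record_t   |  app xsp, resume pc, ...
*                              +------------------+ <- sp handed to the child
*                              | DR's own stack   |
*                              |   usage ...      |  (grows downward)
*/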
if (is_thread_create_syscall(dcontext)) {
create_clone_record(dcontext, sys_param_addr(dcontext, 1) /*newsp*/);
/* We switch the lib tls segment back to app's segment.
* Please refer to comment on os_switch_lib_tls.
*/
if (IF_CLIENT_INTERFACE_ELSE(INTERNAL_OPTION(private_loader), false)) {
os_switch_lib_tls(dcontext, true/*to app*/);
}
} else /* This is really a fork. */
os_fork_pre(dcontext);
break;
}
#elif defined(MACOS)
case SYS_bsdthread_create: {
/* XXX i#1403: we need earlier injection to intercept
* bsdthread_register in order to capture workqueue threads.
* For now we settle for intercepting bsd threads at the user thread func.
* We miss a little user-mode code but this is enough to get started.
*/
app_pc func = (app_pc) sys_param(dcontext, 0);
void *func_arg = (void *) sys_param(dcontext, 1);
void *clone_rec;
LOG(THREAD, LOG_SYSCALLS, 1, "bsdthread_create: thread func "PFX", arg "PFX"\n",
func, func_arg);
handle_clone(dcontext, CLONE_THREAD | CLONE_VM | CLONE_SIGHAND | SIGCHLD);
clone_rec = create_clone_record(dcontext, NULL, func, func_arg);
dcontext->sys_param0 = (reg_t) func;
dcontext->sys_param1 = (reg_t) func_arg;
*sys_param_addr(dcontext, 0) = (reg_t) new_bsdthread_intercept;
*sys_param_addr(dcontext, 1) = (reg_t) clone_rec;
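/* Sketch of the intended flow (the interceptor's internals live elsewhere):
* the kernel now starts the child at new_bsdthread_intercept(clone_rec);
* that routine initializes DR for the new thread and then dispatches to
* the original (func, func_arg) recorded in the clone record above.
*/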
break;
}
#endif
case SYS_vfork: {
/* treat as if sys_clone with flags just as sys_vfork does */
/* in /usr/src/linux/arch/i386/kernel/process.c */
uint flags = CLONE_VFORK | CLONE_VM | SIGCHLD;
LOG(THREAD, LOG_SYSCALLS, 2, "syscall: vfork\n");
handle_clone(dcontext, flags);
cleanup_after_vfork_execve(dcontext);
/* save for post_system_call, treated as if SYS_clone */
dcontext->sys_param0 = (reg_t) flags;
/* vfork has the same needs as clone. Pass info via a clone_record_t
* structure to child. See SYS_clone for info about i#149/PR 403015.
*/
IF_LINUX(ASSERT(is_thread_create_syscall(dcontext)));
dcontext->sys_param1 = mc->xsp; /* for restoring in parent */
#ifdef MACOS
create_clone_record(dcontext, (reg_t *)&mc->xsp, NULL, NULL);
#else
create_clone_record(dcontext, (reg_t *)&mc->xsp /*child uses parent sp*/);
#endif
/* We switch the lib tls segment back to app's segment.
* Please refer to comment on os_switch_lib_tls.
*/
if (IF_CLIENT_INTERFACE_ELSE(INTERNAL_OPTION(private_loader), false)) {
os_switch_lib_tls(dcontext, true/*to app*/);
}
break;
}
case SYS_fork: {
LOG(THREAD, LOG_SYSCALLS, 2, "syscall: fork\n");
os_fork_pre(dcontext);
break;
}
case SYS_execve: {
handle_execve(dcontext);
break;
}
/****************************************************************************/
/* SIGNALS */
case IF_MACOS_ELSE(SYS_sigaction,SYS_rt_sigaction): { /* 174 */
/* in /usr/src/linux/kernel/signal.c:
asmlinkage long
sys_rt_sigaction(int sig, const struct sigaction *act,
struct sigaction *oact, size_t sigsetsize)
*/
int sig = (int) sys_param(dcontext, 0);
const kernel_sigaction_t *act = (const kernel_sigaction_t *) sys_param(dcontext, 1);
kernel_sigaction_t *oact = (kernel_sigaction_t *) sys_param(dcontext, 2);
size_t sigsetsize = (size_t) sys_param(dcontext, 3);
/* post_syscall does some work as well */
dcontext->sys_param0 = (reg_t) sig;
dcontext->sys_param1 = (reg_t) act;
dcontext->sys_param2 = (reg_t) oact;
dcontext->sys_param3 = (reg_t) sigsetsize;
execute_syscall = handle_sigaction(dcontext, sig, act, oact, sigsetsize);
if (!execute_syscall) {
set_success_return_val(dcontext, 0);
}
break;
}
#if defined(LINUX) && !defined(X64)
case SYS_sigreturn: { /* 119 */
/* in /usr/src/linux/arch/i386/kernel/signal.c:
asmlinkage int sys_sigreturn(unsigned long __unused)
*/
execute_syscall = handle_sigreturn(dcontext, false);
/* app will not expect syscall to return, so when handle_sigreturn
* returns false it always redirects the context, and thus no
* need to set return val here.
*/
break;
}
#endif
#ifdef LINUX
case SYS_rt_sigreturn: { /* 173 */
/* in /usr/src/linux/arch/i386/kernel/signal.c:
asmlinkage int sys_rt_sigreturn(unsigned long __unused)
*/
execute_syscall = handle_sigreturn(dcontext, true);
/* see comment for SYS_sigreturn on return val */
break;
}
#endif
#ifdef MACOS
case SYS_sigreturn: {
/* int sigreturn(struct ucontext *uctx, int infostyle) */
execute_syscall = handle_sigreturn(dcontext, (void *) sys_param(dcontext, 0),
(int) sys_param(dcontext, 1));
/* see comment for SYS_sigreturn on return val */
break;
}
#endif
case SYS_sigaltstack: { /* 186 */
/* in /usr/src/linux/arch/i386/kernel/signal.c:
asmlinkage int
sys_sigaltstack(const stack_t *uss, stack_t *uoss)
*/
const stack_t *uss = (const stack_t *) sys_param(dcontext, 0);
stack_t *uoss = (stack_t *) sys_param(dcontext, 1);
execute_syscall =
handle_sigaltstack(dcontext, uss, uoss);
if (!execute_syscall) {
set_success_return_val(dcontext, 0);
}
break;
}
case IF_MACOS_ELSE(SYS_sigprocmask,SYS_rt_sigprocmask): { /* 175 */
/* in /usr/src/linux/kernel/signal.c:
asmlinkage long
sys_rt_sigprocmask(int how, sigset_t *set, sigset_t *oset,
size_t sigsetsize)
*/
/* we also need access to the params in post_system_call */
dcontext->sys_param0 = sys_param(dcontext, 0);
dcontext->sys_param1 = sys_param(dcontext, 1);
dcontext->sys_param2 = sys_param(dcontext, 2);
dcontext->sys_param3 = sys_param(dcontext, 3);
execute_syscall =
handle_sigprocmask(dcontext, (int) sys_param(dcontext, 0),
(kernel_sigset_t *) sys_param(dcontext, 1),
(kernel_sigset_t *) sys_param(dcontext, 2),
(size_t) sys_param(dcontext, 3));
if (!execute_syscall)
set_success_return_val(dcontext, 0);
break;
}
#ifdef MACOS
case SYS_sigsuspend_nocancel:
#endif
case IF_MACOS_ELSE(SYS_sigsuspend,SYS_rt_sigsuspend): { /* 179 */
/* in /usr/src/linux/kernel/signal.c:
asmlinkage int
sys_rt_sigsuspend(sigset_t *unewset, size_t sigsetsize)
*/
handle_sigsuspend(dcontext, (kernel_sigset_t *) sys_param(dcontext, 0),
(size_t) sys_param(dcontext, 1));
break;
}
#ifdef LINUX
case SYS_signalfd: /* 282/321 */
case SYS_signalfd4: { /* 289 */
/* int signalfd (int fd, const sigset_t *mask, size_t sizemask) */
/* int signalfd4(int fd, const sigset_t *mask, size_t sizemask, int flags) */
ptr_int_t new_result;
dcontext->sys_param0 = sys_param(dcontext, 0);
dcontext->sys_param1 = sys_param(dcontext, 1);
dcontext->sys_param2 = sys_param(dcontext, 2);
if (dcontext->sys_num == SYS_signalfd)
dcontext->sys_param3 = 0;
else
dcontext->sys_param3 = sys_param(dcontext, 3);
new_result =
handle_pre_signalfd(dcontext, (int) dcontext->sys_param0,
(kernel_sigset_t *) dcontext->sys_param1,
(size_t) dcontext->sys_param2,
(int) dcontext->sys_param3);
execute_syscall = false;
/* since non-Mac, we can use this even if the call failed */
set_success_return_val(dcontext, new_result);
break;
}
#endif
case SYS_kill: { /* 37 */
/* in /usr/src/linux/kernel/signal.c:
* asmlinkage long sys_kill(int pid, int sig)
*/
pid_t pid = (pid_t) sys_param(dcontext, 0);
uint sig = (uint) sys_param(dcontext, 1);
LOG(GLOBAL, LOG_TOP|LOG_SYSCALLS, 2,
"thread "TIDFMT" sending signal %d to pid "PIDFMT"\n",
get_thread_id(), sig, pid);
/* Check whether the target is this process or this process group:
* pid == 0 targets the caller's process group, and a negative pid
* targets the group -pid.
*/
if (pid == get_process_id() || pid == 0 || pid == -get_process_group_id()) {
handle_self_signal(dcontext, sig);
}
break;
}
#if defined(SYS_tkill)
case SYS_tkill: { /* 238 */
/* in /usr/src/linux/kernel/signal.c:
* asmlinkage long sys_tkill(int pid, int sig)
*/
pid_t tid = (pid_t) sys_param(dcontext, 0);
uint sig = (uint) sys_param(dcontext, 1);
LOG(GLOBAL, LOG_TOP|LOG_SYSCALLS, 2,
"thread "TIDFMT" sending signal %d to tid %d\n",
get_thread_id(), sig, tid);
if (tid == get_thread_id()) {
handle_self_signal(dcontext, sig);
}
break;
}
#endif
#if defined(SYS_tgkill)
case SYS_tgkill: { /* 270 */
/* in /usr/src/linux/kernel/signal.c:
* asmlinkage long sys_tgkill(int tgid, int pid, int sig)
*/
pid_t tgid = (pid_t) sys_param(dcontext, 0);
pid_t tid = (pid_t) sys_param(dcontext, 1);
uint sig = (uint) sys_param(dcontext, 2);
LOG(GLOBAL, LOG_TOP|LOG_SYSCALLS, 2,
"thread "TIDFMT" sending signal %d to tid %d tgid %d\n",
get_thread_id(), sig, tid, tgid);
/* some kernels support -1 values:
* tgkill(-1, tid, sig) == tkill(tid, sig)
* tgkill(tgid, -1, sig) == kill(tgid, sig)
* the 2nd was proposed but is not in 2.6.20 so I'm ignoring it, since
* I don't want to kill the thread when the signal is never sent!
* FIXME: the 1st is in my tkill manpage, but not my 2.6.20 kernel sources!
*/
if ((tgid == -1 || tgid == get_process_id()) &&
tid == get_thread_id()) {
handle_self_signal(dcontext, sig);
}
break;
}
#endif
case SYS_setitimer: /* 104 */
dcontext->sys_param0 = sys_param(dcontext, 0);
dcontext->sys_param1 = sys_param(dcontext, 1);
dcontext->sys_param2 = sys_param(dcontext, 2);
handle_pre_setitimer(dcontext, (int) sys_param(dcontext, 0),
(const struct itimerval *) sys_param(dcontext, 1),
(struct itimerval *) sys_param(dcontext, 2));
break;
case SYS_getitimer: /* 105 */
dcontext->sys_param0 = sys_param(dcontext, 0);
dcontext->sys_param1 = sys_param(dcontext, 1);
break;
#ifdef LINUX
case SYS_alarm: /* 27 on x86 and 37 on x64 */
dcontext->sys_param0 = sys_param(dcontext, 0);
handle_pre_alarm(dcontext, (unsigned int) dcontext->sys_param0);
break;
#endif
#if 0
# ifndef X64
case SYS_signal: { /* 48 */
/* in /usr/src/linux/kernel/signal.c:
asmlinkage unsigned long
sys_signal(int sig, __sighandler_t handler)
*/
break;
}
case SYS_sigaction: { /* 67 */
/* in /usr/src/linux/arch/i386/kernel/signal.c:
asmlinkage int
sys_sigaction(int sig, const struct old_sigaction *act,
struct old_sigaction *oact)
*/
break;
}
case SYS_sigsuspend: { /* 72 */
/* in /usr/src/linux/arch/i386/kernel/signal.c:
asmlinkage int
sys_sigsuspend(int history0, int history1, old_sigset_t mask)
*/
break;
}
case SYS_sigprocmask: { /* 126 */
/* in /usr/src/linux/kernel/signal.c:
asmlinkage long
sys_sigprocmask(int how, old_sigset_t *set, old_sigset_t *oset)
*/
break;
}
# endif
#else
/* until we've implemented them, keep down here to get warning: */
# if defined(LINUX) && !defined(X64)
case SYS_signal:
case SYS_sigaction:
case SYS_sigsuspend:
case SYS_sigprocmask:
# endif
#endif
#if defined(LINUX) && !defined(X64)
case SYS_sigpending: /* 73 */
case SYS_sgetmask: /* 68 */
case SYS_ssetmask: /* 69 */
#endif
#ifdef LINUX
case SYS_rt_sigtimedwait: /* 177 */
case SYS_rt_sigqueueinfo: /* 178 */
#endif
case IF_MACOS_ELSE(SYS_sigpending,SYS_rt_sigpending): { /* 176 */
/* FIXME: handle all of these syscalls! */
LOG(THREAD, LOG_ASYNCH|LOG_SYSCALLS, 1,
"WARNING: unhandled signal system call %d\n", dcontext->sys_num);
break;
}
/****************************************************************************/
/* FILES */
/* prevent app from closing our files or opening a new file in our fd space.
* it's not worth monitoring all syscalls that take in fds to keep them
* from affecting ours.
*/
#ifdef MACOS
case SYS_close_nocancel:
#endif
case SYS_close: {
execute_syscall = handle_close_pre(dcontext);
#ifdef LINUX
if (execute_syscall)
signal_handle_close(dcontext, (file_t) sys_param(dcontext, 0));
#endif
break;
}
case SYS_dup2:
IF_LINUX(case SYS_dup3:) {
file_t newfd = (file_t) sys_param(dcontext, 1);
if (fd_is_dr_owned(newfd) || fd_is_in_private_range(newfd)) {
SYSLOG_INTERNAL_WARNING_ONCE("app trying to dup-close DR file(s)");
LOG(THREAD, LOG_TOP|LOG_SYSCALLS, 1,
"WARNING: app trying to dup2/dup3 to %d. Disallowing.\n", newfd);
set_failure_return_val(dcontext, EBADF);
DODEBUG({ dcontext->expect_last_syscall_to_fail = true; });
execute_syscall = false;
}
break;
}
#ifdef MACOS
case SYS_fcntl_nocancel:
#endif
case SYS_fcntl: {
int cmd = (int) sys_param(dcontext, 1);
long arg = (long) sys_param(dcontext, 2);
/* we only check for a requested minimum that falls inside our private
* space: we don't catch a min below the range where the fd actually
* chosen ends up above it (see notes in os_file_init())
*/
if ((cmd == F_DUPFD || cmd == F_DUPFD_CLOEXEC) && fd_is_in_private_range(arg)) {
SYSLOG_INTERNAL_WARNING_ONCE("app trying to open private fd(s)");
LOG(THREAD, LOG_TOP|LOG_SYSCALLS, 1,
"WARNING: app trying to dup to >= %d. Disallowing.\n", arg);
set_failure_return_val(dcontext, EINVAL);
DODEBUG({ dcontext->expect_last_syscall_to_fail = true; });
execute_syscall = false;
} else {
dcontext->sys_param0 = sys_param(dcontext, 0);
dcontext->sys_param1 = cmd;
}
break;
}
case SYS_getrlimit: {
/* save for post */
dcontext->sys_param0 = sys_param(dcontext, 0);
dcontext->sys_param1 = sys_param(dcontext, 1);
break;
}
case SYS_setrlimit: {
int resource = (int) sys_param(dcontext, 0);
if (resource == RLIMIT_NOFILE && DYNAMO_OPTION(steal_fds) > 0) {
/* don't let app change limits as that would mess up our fd space */
set_failure_return_val(dcontext, EPERM);
DODEBUG({ dcontext->expect_last_syscall_to_fail = true; });
execute_syscall = false;
}
break;
}
/* i#107 syscalls that might change/query app's segment */
#ifdef LINUX
# ifdef X64
case SYS_arch_prctl: {
/* we handle arch_prctl in post_syscall */
dcontext->sys_param0 = sys_param(dcontext, 0);
dcontext->sys_param1 = sys_param(dcontext, 1);
break;
}
# endif
case SYS_set_thread_area: {
our_modify_ldt_t desc;
if (INTERNAL_OPTION(mangle_app_seg) &&
safe_read((void *)sys_param(dcontext, 0),
sizeof(desc), &desc)) {
if (os_set_app_thread_area(dcontext, &desc) &&
safe_write_ex((void *)sys_param(dcontext, 0),
sizeof(desc), &desc, NULL)) {
/* check if the range is unlimited */
ASSERT_CURIOSITY(desc.limit == 0xfffff);
execute_syscall = false;
set_success_return_val(dcontext, 0);
}
}
break;
}
case SYS_get_thread_area: {
our_modify_ldt_t desc;
if (INTERNAL_OPTION(mangle_app_seg) &&
safe_read((const void *)sys_param(dcontext, 0),
sizeof(desc), &desc)) {
if (os_get_app_thread_area(dcontext, &desc) &&
safe_write_ex((void *)sys_param(dcontext, 0),
sizeof(desc), &desc, NULL)) {
execute_syscall = false;
set_success_return_val(dcontext, 0);
}
}
break;
}
#elif defined(MACOS)
/* FIXME i#58: handle i386_{get,set}_ldt and thread_fast_set_cthread_self64 */
#endif
#ifdef DEBUG
# ifdef MACOS
case SYS_open_nocancel:
# endif
case SYS_open: {
dcontext->sys_param0 = sys_param(dcontext, 0);
break;
}
#endif
default: {
#ifdef VMX86_SERVER
if (is_vmkuw_sysnum(dcontext->sys_num)) {
execute_syscall = vmkuw_pre_system_call(dcontext);
break;
}
#endif
}
} /* end switch */
dcontext->whereami = old_whereami;
return execute_syscall;
}
void
all_memory_areas_lock(void)
{
IF_NO_MEMQUERY(memcache_lock());
}
void
all_memory_areas_unlock(void)
{
IF_NO_MEMQUERY(memcache_unlock());
}
void
update_all_memory_areas(app_pc start, app_pc end, uint prot, int type)
{
IF_NO_MEMQUERY(memcache_update(start, end, prot, type));
}
bool
remove_from_all_memory_areas(app_pc start, app_pc end)
{
IF_NO_MEMQUERY(return memcache_remove(start, end));
return true;
}
/* We consider a module load to happen at the first mmap, so we check on later
* overmaps to ensure things look consistent. */
static bool
mmap_check_for_module_overlap(app_pc base, size_t size, bool readable, uint64 inode,
bool at_map)
{
module_area_t *ma;
os_get_module_info_lock();
ma = module_pc_lookup(base);
if (ma != NULL) {
/* FIXME - how can we distinguish between the loader mapping the segments
* over the initial map and someone just mapping over part of a module? In
* the latter case we need to adjust the view size or remove the module
* from the module list. */
LOG(GLOBAL, LOG_VMAREAS, 2, "%s mmap overlapping module area : \n"
"\tmap : base="PFX" base+size="PFX" inode="UINT64_FORMAT_STRING"\n"
"\tmod : start="PFX" end="PFX" inode="UINT64_FORMAT_STRING"\n",
at_map ? "new" : "existing", base, base+size, inode,
ma->start, ma->end, ma->names.inode);
ASSERT_CURIOSITY(base >= ma->start);
if (at_map) {
ASSERT_CURIOSITY(base+size <= ma->end);
} else {
/* FIXME - I'm having problems with this check for existing maps. I
* haven't been able to get gdb to break in early enough to really get a good
* look at the early loader behavior. Two issues: One case is with our .so
* for which the anonymous .bss mapping is one page larger than expected
* (which might be some loader bug in the size calculation? or something? if
* so should see it trigger the at_map curiosity on some dll and can address
* then) and the other is that for a few executables the .bss mapping is much
* larger (~0x20000 larger) than expected when running under DR (but not
* running natively where it is instead the expected size). Both could just
* be the loader merging adjacent identically protected regions though I
* can't explain the discrepancy between DR and native given that our vmmheap
* is elsewhere in the address space (so who allocated that adjacent memory,
* and how?). I've yet to see any issue with dynamically loaded modules so
* it's probably the loader merging regions. Still worth investigating. */
ASSERT_CURIOSITY(inode == 0 /*see above comment*/||
module_contains_addr(ma, base+size-1));
}
ASSERT_CURIOSITY(ma->names.inode == inode || inode == 0 /* for .bss */);
DOCHECK(1, {
if (readable && module_is_header(base, size)) {
/* Case 8879: For really small modules, to save disk space, the same
* disk page could hold both RO and .data, occupying just 1 page of
* disk space, e.g. /usr/lib/httpd/modules/mod_auth_anon.so. When
* such a module is mapped in, the os maps the same disk page twice,
* one readonly and one copy-on-write (see pg. 96, Sec 4.4 from
* Linkers and Loaders by John R. Levine). This makes the data
* section also satisfy the elf_header check above. So, if the new
* mmap overlaps an elf_area and it is also a header, then make sure
* the previous page (correcting for alignment) is also an elf_header.
* Note, if it is a header of a different module, then we'll not have
* an overlap, so we will not hit this case.
*/
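/* Sketch with hypothetical addresses: a tiny SO whose single disk page
* is mapped twice,
*   0xb7000000: r-x  (module start; the real ELF header)
*   0xb7001000: rw-  (copy-on-write data, same disk page)
* so a later map at base=0xb7001000 also passes module_is_header().
*/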
ASSERT_CURIOSITY(ma->start + ma->os_data.alignment == base);
}
});
}
os_get_module_info_unlock();
return ma != NULL;
}
/* All processing for mmap and mmap2. */
static void
process_mmap(dcontext_t *dcontext, app_pc base, size_t size, uint prot,
uint flags _IF_DEBUG(const char *map_type))
{
bool image = false;
uint memprot = osprot_to_memprot(prot);
#ifdef CLIENT_INTERFACE
bool inform_client = false;
#endif
LOG(THREAD, LOG_SYSCALLS, 4, "process_mmap("PFX","PFX",%s,%s)\n",
base, size, memprot_string(memprot), map_type);
/* Notes on how ELF SOs are mapped in.
*
* o The initial mmap for an ELF file specifies enough space for
* all segments (and their constituent sections) in the file.
* The protection bits for that section are used for the entire
* region, and subsequent mmaps for subsequent segments within
* the region modify their portion's protection bits as needed.
* So if the prot bits for the first segment are +x, the entire
* region is +x. ** Note that our primary concern is adjusting
* exec areas to reflect the prot bits of subsequent
* segments. ** The region is added to the all-memory areas
* and also to exec areas (as determined by app_memory_allocation()).
*
* o Any subsequent segment sub-mappings specify their own protection
* bits and therefore are added to the exec areas via normal
* processing. They are also "naturally" added to the all-mems list.
* We do a little extra processing when mapping into a previously
* mapped region and the prot bits mismatch; if the new mapping is
* not +x, flushing needs to occur.
*/
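/* A hypothetical dlopen() of a two-segment SO might thus appear as
* (addresses, offsets, and sizes illustrative only):
*   mmap(0, image_sz, PROT_READ|PROT_EXEC, MAP_PRIVATE, fd, 0) = base
*   mmap(base+data_off, data_sz, PROT_READ|PROT_WRITE,
*        MAP_PRIVATE|MAP_FIXED, fd, file_data_off)
*   mmap(base+bss_off, bss_sz, PROT_READ|PROT_WRITE,
*        MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0)
* where only the 2nd and 3rd maps change prot bits within the initial
* +x region.
*/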
/* process_mmap can be called with PROT_NONE, so we need to check whether
* we can read the memory before testing whether it is an ELF header
*/
/* FIXME - get inode for check */
if (TEST(MAP_ANONYMOUS, flags)) {
/* not an ELF mmap */
} else if (mmap_check_for_module_overlap(base, size,
TEST(MEMPROT_READ, memprot), 0, true)) {
/* FIXME - how can we distinguish between the loader mapping the segments
* over the initial map and someone just mapping over part of a module? In
* the latter case we need to adjust the view size or remove the module
* from the module list. */
image = true;
DODEBUG({ map_type = "ELF SO"; });
} else if (module_is_partial_map(base, size, memprot)) {
/* i#1240: App might read first page of ELF header using mmap, which
* might accidentally be treated as a module load. Heuristically
* distinguish this by saying that if this is the first mmap for an ELF
* (i.e., it doesn't overlap with a previous map), and if it's small,
* then don't treat it as a module load.
*/
} else if (TEST(MEMPROT_READ, memprot) &&
/* i#727: We can still get SIGBUS on mmap'ed files that can't be
* read, so pass size=0 to use a safe_read.
*/
module_is_header(base, 0)) {
memquery_iter_t iter;
bool found_map = false;
uint64 inode = 0;
const char *filename = "";
LOG(THREAD, LOG_SYSCALLS|LOG_VMAREAS, 2, "dlopen "PFX"-"PFX"%s\n",
base, base+size, TEST(MEMPROT_EXEC, memprot) ? " +x": "");
image = true;
DODEBUG({ map_type = "ELF SO"; });
/* Mapping in a new module. From what we've observed of the loader's
* behavior, it first maps the file in with size equal to the final
* memory image size (I'm not sure how it gets that size without reading
* in the elf header and then walking through all the program headers to
* get the largest virtual offset). This is necessary to reserve all the
* space that will be needed. It then walks through the program headers
* mapping over the previously mapped space with the appropriate
* permissions and offsets. Note that the .bss portion is mapped over
* as anonymous. It may also, depending on the program headers, make some
* areas read-only after fixing up their relocations etc. NOTE - at
* no point are the section headers guaranteed to be mapped in so we can't
* reliably walk sections (only segments) without looking to disk.
*/
/* FIXME - when should we add the module to our list? At the first map
* seems to be the best choice as we know the bounds and it's difficult to
* tell when the loader is finished. The downside is that at the initial map
* the memory layout isn't finalized (memory beyond the first segment will
* be shifted for page alignment reasons), so we have to be careful and
* make adjustments to read anything beyond the first segment until the
* loader finishes. This goes for the client too as it gets notified when we
* add to the list. FIXME we could try to track the expected segment overmaps
* and only notify the client after the last one (though that's still before
* linking and relocation, but that's true on Windows too). */
/* Get filename & inode for the list. */
memquery_iterator_start(&iter, base, true /* plan to alloc a module_area_t */);
while (memquery_iterator_next(&iter)) {
if (iter.vm_start == base) {
ASSERT_CURIOSITY(iter.inode != 0);
ASSERT_CURIOSITY(iter.offset == 0); /* first map shouldn't have offset */
/* XREF 307599 on rounding module end to the next PAGE boundary */
ASSERT_CURIOSITY(iter.vm_end - iter.vm_start ==
ALIGN_FORWARD(size, PAGE_SIZE));
inode = iter.inode;
filename = dr_strdup(iter.comment HEAPACCT(ACCT_OTHER));
found_map = true;
break;
}
}
memquery_iterator_stop(&iter);
#ifdef HAVE_MEMINFO
ASSERT_CURIOSITY(found_map); /* barring weird races we should find this map */
#else /* HAVE_MEMINFO */
/* Without /proc/maps or other memory querying interface available at
* library map time, there is no way to find out the name of the file
* that was mapped, thus its inode isn't available either.
*
* Just module_list_add with no filename will still result in
* library name being extracted from the .dynamic section and added
* to the module list. However, this name may not always exist, thus
* we might have a library with no file name available at all!
*
* Note: visor implements vsi mem maps that give file info, but, no
* path, should be ok. xref PR 401580.
*
* Once PR 235433 is implemented in visor then fix memquery_iterator*() to
* use vsi to find out page protection info, file name & inode.
*/
#endif /* HAVE_MEMINFO */
/* XREF 307599 on rounding module end to the next PAGE boundary */
module_list_add(base, ALIGN_FORWARD(size, PAGE_SIZE), true, filename, inode);
#ifdef CLIENT_INTERFACE
inform_client = true;
#endif
if (found_map)
dr_strfree(filename HEAPACCT(ACCT_OTHER));
}
IF_NO_MEMQUERY(memcache_handle_mmap(dcontext, base, size, prot, image));
/* app_memory_allocation() expects not to see an overlap -- the exec
* areas list doesn't expect one. We have yet to see a +x mmap into a
* previously mapped +x region, but we do check for and handle that in
* pre-syscall (i#1175).
*/
LOG(THREAD, LOG_SYSCALLS, 4, "\t try app_mem_alloc\n");
if (app_memory_allocation(dcontext, base, size, memprot, image _IF_DEBUG(map_type)))
STATS_INC(num_app_code_modules);
LOG(THREAD, LOG_SYSCALLS, 4, "\t app_mem_alloc -- DONE\n");
#ifdef CLIENT_INTERFACE
/* invoke the client event only after DR's state is consistent */
if (inform_client && dynamo_initialized)
instrument_module_load_trigger(base);
#endif
}
#ifdef LINUX
/* Call right after the system call.
* i#173: old_prot and old_type should be from before the system call
*/
static bool
handle_app_mremap(dcontext_t *dcontext, byte *base, size_t size,
byte *old_base, size_t old_size, uint old_prot, uint old_type)
{
if (!mmap_syscall_succeeded(base))
return false;
if (base != old_base || size < old_size) { /* take action only if
* there was a change */
DEBUG_DECLARE(bool ok;)
/* fragments were shifted...don't try to fix them, just flush */
app_memory_deallocation(dcontext, (app_pc)old_base, old_size,
false /* don't own thread_initexit_lock */,
false /* not image, FIXME: somewhat arbitrary */);
DOCHECK(1, {
/* we don't expect to see remappings of modules */
os_get_module_info_lock();
ASSERT_CURIOSITY(!module_overlaps(base, size));
os_get_module_info_unlock();
});
/* Verify that the current prot on the new region (according to
* the os) is the same as what the prot used to be for the old
* region.
*/
DOCHECK(1, {
uint memprot;
ok = get_memory_info_from_os(base, NULL, NULL, &memprot);
/* allow maps to have gained +x:
* +x may be caused by READ_IMPLIES_EXEC being set in the personality
* flags (i#262)
*/
ASSERT(ok && (memprot == old_prot ||
(memprot & (~MEMPROT_EXEC)) == old_prot));
});
app_memory_allocation(dcontext, base, size, old_prot,
old_type == DR_MEMTYPE_IMAGE
_IF_DEBUG("mremap"));
IF_NO_MEMQUERY(memcache_handle_mremap(dcontext, base, size, old_base, old_size,
old_prot, old_type));
}
return true;
}
static void
handle_app_brk(dcontext_t *dcontext, byte *old_brk, byte *new_brk)
{
/* i#851: the brk might not be page aligned */
old_brk = (app_pc) ALIGN_FORWARD(old_brk, PAGE_SIZE);
new_brk = (app_pc) ALIGN_FORWARD(new_brk, PAGE_SIZE);
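/* E.g., with 4K pages, old_brk=0x0804a123 and new_brk=0x08050000 are
* treated as the page range 0x0804b000..0x08050000 (numbers illustrative).
*/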
if (new_brk < old_brk) {
/* Usually the heap is writable, so we don't really need to call
* this here: but seems safest to do so, esp if someone made part of
* the heap read-only and then put code there.
*/
app_memory_deallocation(dcontext, new_brk, old_brk - new_brk,
false /* don't own thread_initexit_lock */,
false /* not image */);
} else if (new_brk > old_brk) {
/* No need to call app_memory_allocation() as doesn't interact
* w/ security policies.
*/
}
IF_NO_MEMQUERY(memcache_handle_app_brk(old_brk, new_brk));
}
#endif
/* FIXME: split out specific handlers into separate routines
*/
void
post_system_call(dcontext_t *dcontext)
{
priv_mcontext_t *mc = get_mcontext(dcontext);
/* registers have been clobbered, so sysnum is kept in dcontext */
int sysnum = dcontext->sys_num;
/* We expect most syscall failures to return < 0, so >= 0 is success.
* Some syscalls return addresses that have the sign bit set and so
* appear to be failures but are not. They are handled on a
* case-by-case basis in the switch statement below.
*/
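/* E.g., a 32-bit mmap can legitimately return 0xb7xxxxxx, which is
* negative as a signed int, yet the kernel reserves only [-4095, -1]
* (i.e., -MAX_ERRNO..-1) for errors: hence the special mmap/mremap
* checks below.
*/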
ptr_int_t result = (ptr_int_t) MCXT_SYSCALL_RES(mc); /* signed */
bool success = syscall_successful(mc, sysnum);
app_pc base;
size_t size;
uint prot;
where_am_i_t old_whereami;
DEBUG_DECLARE(bool ok;)
RSTATS_INC(post_syscall);
old_whereami = dcontext->whereami;
dcontext->whereami = WHERE_SYSCALL_HANDLER;
#if defined(LINUX) && defined(X86)
/* PR 313715: restore xbp since for some vsyscall sequences that use
* the syscall instruction its value is needed:
* 0xffffe400 <__kernel_vsyscall+0>: push %ebp
* 0xffffe401 <__kernel_vsyscall+1>: mov %ecx,%ebp
* 0xffffe403 <__kernel_vsyscall+3>: syscall
* 0xffffe405 <__kernel_vsyscall+5>: mov $0x2b,%ecx
* 0xffffe40a <__kernel_vsyscall+10>: movl %ecx,%ss
* 0xffffe40c <__kernel_vsyscall+12>: mov %ebp,%ecx
* 0xffffe40e <__kernel_vsyscall+14>: pop %ebp
* 0xffffe40f <__kernel_vsyscall+15>: ret
*/
if (should_syscall_method_be_sysenter() && !dcontext->sys_was_int) {
mc->xbp = dcontext->sys_xbp;
}
#endif
/* handle fork, try to do it early before too much logging occurs */
if (sysnum == SYS_fork
IF_LINUX(|| (sysnum == SYS_clone && !TEST(CLONE_VM, dcontext->sys_param0)))) {
if (result == 0) {
/* we're the child */
thread_id_t child = get_sys_thread_id();
#ifdef DEBUG
thread_id_t parent = get_parent_id();
SYSLOG_INTERNAL_INFO("-- parent %d forked child %d --", parent, child);
#endif
/* first, fix TLS of dcontext */
ASSERT(parent != 0);
/* change parent pid to our pid */
replace_thread_id(dcontext->owning_thread, child);
dcontext->owning_thread = child;
dcontext->owning_process = get_process_id();
/* now let dynamo initialize new shared memory, logfiles, etc.
* need access to static vars in dynamo.c, that's why we don't do it. */
/* FIXME - xref PR 246902 - dispatch runs a lot of code before
* getting to post_system_call(): is any of that going to be messed up
* by waiting till here to fix up the child logfolder/file and tid?
*/
dynamorio_fork_init(dcontext);
LOG(THREAD, LOG_SYSCALLS, 1,
"after fork-like syscall: parent is %d, child is %d\n", parent, child);
} else {
/* we're the parent */
os_fork_post(dcontext, true/*parent*/);
}
}
LOG(THREAD, LOG_SYSCALLS, 2,
"post syscall: sysnum="PFX", result="PFX" (%d)\n",
sysnum, MCXT_SYSCALL_RES(mc), (int)MCXT_SYSCALL_RES(mc));
switch (sysnum) {
/****************************************************************************/
/* MEMORY REGIONS */
#ifdef DEBUG
# ifdef MACOS
case SYS_open_nocancel:
# endif
case SYS_open: {
if (success) {
/* useful for figuring out what module was loaded that then triggers
* module.c elf curiosities
*/
LOG(THREAD, LOG_SYSCALLS, 2, "SYS_open %s => %d\n",
dcontext->sys_param0, (int)result);
}
break;
}
#endif
#if defined(LINUX) && !defined(X64)
case SYS_mmap2:
#endif
case SYS_mmap: {
uint flags;
DEBUG_DECLARE(const char *map_type;)
RSTATS_INC(num_app_mmaps);
base = (app_pc) MCXT_SYSCALL_RES(mc); /* For mmap, it's NOT arg->addr! */
/* mmap isn't simply a user-space wrapper for mmap2. It's called
* directly when dynamically loading an SO, i.e., dlopen(). */
#ifdef LINUX /* MacOS success is in CF */
success = mmap_syscall_succeeded((app_pc)result);
/* The syscall either failed OR the retcode is less than the
* largest uint value of any errno and the addr returned is
* page-aligned.
*/
ASSERT_CURIOSITY(!success ||
((app_pc)result < (app_pc)(ptr_int_t)-0x1000 &&
ALIGNED(base, PAGE_SIZE)));
#else
ASSERT_CURIOSITY(!success || ALIGNED(base, PAGE_SIZE));
#endif
if (!success)
goto exit_post_system_call;
#if defined(LINUX) && !defined(X64)
if (sysnum == SYS_mmap) {
/* The syscall succeeded so the read of 'arg' should be
* safe. */
mmap_arg_struct_t *arg = (mmap_arg_struct_t *) dcontext->sys_param0;
size = (size_t) arg->len;
prot = (uint) arg->prot;
flags = (uint) arg->flags;
DEBUG_DECLARE(map_type = "mmap";)
}
else {
#endif
size = (size_t) dcontext->sys_param1;
prot = (uint) dcontext->sys_param2;
flags = (uint) dcontext->sys_param3;
DEBUG_DECLARE(map_type = IF_X64_ELSE("mmap2","mmap");)
#if defined(LINUX) && !defined(X64)
}
#endif
process_mmap(dcontext, base, size, prot, flags _IF_DEBUG(map_type));
break;
}
case SYS_munmap: {
app_pc addr = (app_pc) dcontext->sys_param0;
size_t len = (size_t) dcontext->sys_param1;
/* We assumed in pre_system_call() that the unmap would succeed
* and flushed fragments and removed the region from exec areas.
* If the unmap failed, we re-add the region to exec areas.
*
* The same logic can be used on Windows (but isn't yet).
*/
/* FIXME There are shortcomings to the approach. If another thread
* executes in the region after our pre_system_call processing
* but before the re-add below, it will get a security violation.
* That's less than ideal but at least isn't a security hole.
* The overall shortcoming is that we lose the state from our
* stateful security policies -- future exec list, tables used
* for RCT (.C/.E/.F) -- which can't be easily restored. Also,
* the re-add could add a region that wasn't on the exec list
* previously.
*
* See case 7559 for a better approach.
*/
if (!success) {
dr_mem_info_t info;
/* must go to os to get real memory since we already removed */
DEBUG_DECLARE(ok =)
query_memory_ex_from_os(addr, &info);
ASSERT(ok);
app_memory_allocation(dcontext, addr, len, info.prot,
info.type == DR_MEMTYPE_IMAGE
_IF_DEBUG("failed munmap"));
IF_NO_MEMQUERY(memcache_update_locked(addr, addr + len, info.prot,
info.type, true/*exists*/));
}
break;
}
#ifdef LINUX
case SYS_mremap: {
app_pc old_base = (app_pc) dcontext->sys_param0;
size_t old_size = (size_t) dcontext->sys_param1;
base = (app_pc) MCXT_SYSCALL_RES(mc);
size = (size_t) dcontext->sys_param2;
/* even if no shift, count as munmap plus mmap */
RSTATS_INC(num_app_munmaps);
RSTATS_INC(num_app_mmaps);
success = handle_app_mremap(dcontext, base, size, old_base, old_size,
/* i#173: use memory prot and type
* obtained from pre_system_call
*/
(uint) dcontext->sys_param3,
(uint) dcontext->sys_param4);
/* The syscall either failed OR the retcode is less than the
* largest uint value of any errno and the addr returned is
* page-aligned.
*/
ASSERT_CURIOSITY(!success ||
((app_pc)result < (app_pc)(ptr_int_t)-0x1000 &&
ALIGNED(base, PAGE_SIZE)));
if (!success)
goto exit_post_system_call;
break;
}
#endif
case SYS_mprotect: {
base = (app_pc) dcontext->sys_param0;
size = dcontext->sys_param1;
prot = dcontext->sys_param2;
#ifdef VMX86_SERVER
/* PR 475111: workaround for PR 107872 */
if (os_in_vmkernel_userworld() &&
result == -EBUSY && prot == PROT_NONE) {
result = mprotect_syscall(base, size, PROT_READ);
/* since non-Mac, we can use this even if the call failed */
set_success_return_val(dcontext, result);
success = (result >= 0);
LOG(THREAD, LOG_VMAREAS, 1,
"re-doing mprotect -EBUSY for "PFX"-"PFX" => %d\n",
base, base + size, (int)result);
SYSLOG_INTERNAL_WARNING_ONCE("re-doing mprotect for PR 475111, PR 107872");
}
#endif
/* FIXME i#143: we need to tweak the returned oldprot for
* writable areas we've made read-only
*/
if (!success) {
uint memprot = 0;
/* Revert the prot bits if needed. */
if (!get_memory_info_from_os(base, NULL, NULL, &memprot))
memprot = PROT_NONE;
LOG(THREAD, LOG_SYSCALLS, 3,
"syscall: mprotect failed: "PFX"-"PFX" prot->%d\n",
base, base+size, osprot_to_memprot(prot));
LOG(THREAD, LOG_SYSCALLS, 3, "\told prot->%d\n", memprot);
if (prot != memprot_to_osprot(memprot)) {
/* We're trying to reverse the prot change, assuming that
* this action doesn't have any unexpected side effects
* when doing so (such as not reversing some bit of internal
* state).
*/
uint new_memprot;
DEBUG_DECLARE(uint res =)
app_memory_protection_change(dcontext, base, size,
osprot_to_memprot(prot),
&new_memprot,
NULL);
ASSERT_NOT_IMPLEMENTED(res != SUBSET_APP_MEM_PROT_CHANGE);
ASSERT(res == DO_APP_MEM_PROT_CHANGE ||
res == PRETEND_APP_MEM_PROT_CHANGE);
/* PR 410921 - Revert the changes to all-mems list.
* FIXME: This fix assumes the whole region had the prot &
* type, which is true in the cases we have seen so far, but
* theoretically may not be true. If it isn't true, multiple
* memory areas with different types/protections might have
* been changed in pre_system_call(), so will have to keep a
* list of all vmareas changed. This might be expensive for
* each mprotect syscall to guard against a rare theoretical bug.
*/
ASSERT_CURIOSITY(!dcontext->mprot_multi_areas);
IF_NO_MEMQUERY(memcache_update_locked(base, base + size,
memprot, -1/*type unchanged*/,
true/*exists*/));
}
}
break;
}
#ifdef LINUX
case SYS_brk: {
/* i#91/PR 396352: need to watch SYS_brk to maintain all_memory_areas.
* This code should work regardless of whether syscall failed
* (if it failed, the old break will be returned). We stored
* the old break in sys_param1 in pre-syscall.
*/
app_pc old_brk = (app_pc) dcontext->sys_param1;
app_pc new_brk = (app_pc) result;
DEBUG_DECLARE(app_pc req_brk = (app_pc) dcontext->sys_param0;);
# ifdef DEBUG
if (DYNAMO_OPTION(early_inject) &&
req_brk != NULL /* Ignore calls that don't increase brk. */) {
DO_ONCE({
ASSERT_CURIOSITY(new_brk > old_brk && "i#1004: first brk() "
"allocation failed with -early_inject");
});
}
# endif
handle_app_brk(dcontext, old_brk, new_brk);
break;
}
#endif
/****************************************************************************/
/* SPAWNING -- fork mostly handled above */
#ifdef LINUX
case SYS_clone: {
/* in /usr/src/linux/arch/i386/kernel/process.c */
LOG(THREAD, LOG_SYSCALLS, 2, "syscall: clone returned "PFX"\n",
MCXT_SYSCALL_RES(mc));
/* We switch the lib tls segment back to dr's segment.
* Please refer to comment on os_switch_lib_tls.
* It is only called in parent thread.
* The child thread's tls setup is done in os_tls_app_seg_init.
*/
if (was_thread_create_syscall(dcontext) &&
IF_CLIENT_INTERFACE_ELSE(INTERNAL_OPTION(private_loader), false)) {
os_switch_lib_tls(dcontext, false/*to dr*/);
}
break;
}
#elif defined(MACOS) && !defined(X64)
case SYS_bsdthread_create: {
/* restore stack values we clobbered */
ASSERT(*sys_param_addr(dcontext, 0) == (reg_t) new_bsdthread_intercept);
*sys_param_addr(dcontext, 0) = dcontext->sys_param0;
*sys_param_addr(dcontext, 1) = dcontext->sys_param1;
break;
}
#endif
case SYS_fork: {
LOG(THREAD, LOG_SYSCALLS, 2, "syscall: fork returned "PFX"\n",
MCXT_SYSCALL_RES(mc));
break;
}
case SYS_vfork: {
LOG(THREAD, LOG_SYSCALLS, 2, "syscall: vfork returned "PFX"\n",
MCXT_SYSCALL_RES(mc));
IF_LINUX(ASSERT(was_thread_create_syscall(dcontext)));
/* restore xsp in parent */
LOG(THREAD, LOG_SYSCALLS, 2,
"vfork: restoring xsp from "PFX" to "PFX"\n",
mc->xsp, dcontext->sys_param1);
mc->xsp = dcontext->sys_param1;
if (MCXT_SYSCALL_RES(mc) != 0) {
/* We switch the lib tls segment back to dr's segment.
* Please refer to comment on os_switch_lib_tls.
* It is only called in parent thread.
* The child thread's tls setup is done in os_tls_app_seg_init.
*/
if (IF_CLIENT_INTERFACE_ELSE(INTERNAL_OPTION(private_loader), false)) {
os_switch_lib_tls(dcontext, false/*to dr*/);
}
}
break;
}
case SYS_execve: {
/* if we get here it means execve failed (doesn't return on success) */
success = false;
mark_thread_execve(dcontext->thread_record, false);
ASSERT(result < 0);
LOG(THREAD, LOG_SYSCALLS, 2, "syscall: execve failed\n");
handle_execve_post(dcontext);
/* Don't 'break' as we have an ASSERT(success) just below
* the switch(). */
goto exit_post_system_call;
break; /* unnecessary but good form so keep it */
}
/****************************************************************************/
/* SIGNALS */
case IF_MACOS_ELSE(SYS_sigaction,SYS_rt_sigaction): { /* 174 */
/* in /usr/src/linux/kernel/signal.c:
asmlinkage long
sys_rt_sigaction(int sig, const struct sigaction *act,
struct sigaction *oact, size_t sigsetsize)
*/
/* FIXME i#148: Handle syscall failure. */
int sig = (int) dcontext->sys_param0;
const kernel_sigaction_t *act =
(const kernel_sigaction_t *) dcontext->sys_param1;
kernel_sigaction_t *oact = (kernel_sigaction_t *) dcontext->sys_param2;
size_t sigsetsize = (size_t) dcontext->sys_param3;
if (!success)
goto exit_post_system_call;
handle_post_sigaction(dcontext, sig, act, oact, sigsetsize);
break;
}
case IF_MACOS_ELSE(SYS_sigprocmask,SYS_rt_sigprocmask): { /* 175 */
/* in /usr/src/linux/kernel/signal.c:
asmlinkage long
sys_rt_sigprocmask(int how, sigset_t *set, sigset_t *oset,
size_t sigsetsize)
*/
/* FIXME i#148: Handle syscall failure. */
handle_post_sigprocmask(dcontext, (int) dcontext->sys_param0,
(kernel_sigset_t *) dcontext->sys_param1,
(kernel_sigset_t *) dcontext->sys_param2,
(size_t) dcontext->sys_param3);
break;
}
#if defined(LINUX) && !defined(X64)
case SYS_sigreturn: /* 119 */
#endif
case IF_MACOS_ELSE(SYS_sigreturn,SYS_rt_sigreturn): /* 173 */
/* there is no return value: it's just the value of eax, so avoid
* assert below
*/
success = true;
break;
case SYS_setitimer: /* 104 */
handle_post_setitimer(dcontext, success, (int) dcontext->sys_param0,
(const struct itimerval *) dcontext->sys_param1,
(struct itimerval *) dcontext->sys_param2);
break;
case SYS_getitimer: /* 105 */
handle_post_getitimer(dcontext, success, (int) dcontext->sys_param0,
(struct itimerval *) dcontext->sys_param1);
break;
#ifdef LINUX
case SYS_alarm: /* 27 on x86 and 37 on x64 */
handle_post_alarm(dcontext, success, (unsigned int) dcontext->sys_param0);
break;
#endif
#if defined(LINUX) && defined(X64)
case SYS_arch_prctl: {
if (success && INTERNAL_OPTION(mangle_app_seg)) {
tls_handle_post_arch_prctl(dcontext, dcontext->sys_param0,
dcontext->sys_param1);
}
break;
}
#endif
/****************************************************************************/
/* FILES */
case SYS_dup2:
IF_LINUX(case SYS_dup3:) {
#ifdef LINUX
if (success)
signal_handle_dup(dcontext, (file_t) sys_param(dcontext, 1), (file_t) result);
#endif
break;
}
#ifdef MACOS
case SYS_fcntl_nocancel:
#endif
case SYS_fcntl: {
#ifdef LINUX /* Linux-only since only for signalfd */
if (success) {
file_t fd = (long) dcontext->sys_param0;
int cmd = (int) dcontext->sys_param1;
if (cmd == F_DUPFD || cmd == F_DUPFD_CLOEXEC)
signal_handle_dup(dcontext, fd, (file_t) result);
}
#endif
/* The break belongs outside the ifdef so that on MacOS we don't fall
* through into SYS_getrlimit below.
*/
break;
}
case SYS_getrlimit: {
int resource = dcontext->sys_param0;
if (success && resource == RLIMIT_NOFILE) {
/* we stole some space: hide it from app */
struct rlimit *rlim = (struct rlimit *) dcontext->sys_param1;
safe_write_ex(&rlim->rlim_cur, sizeof(rlim->rlim_cur),
&app_rlimit_nofile.rlim_cur, NULL);
safe_write_ex(&rlim->rlim_max, sizeof(rlim->rlim_max),
&app_rlimit_nofile.rlim_max, NULL);
}
break;
}
#ifdef VMX86_SERVER
default:
if (is_vmkuw_sysnum(sysnum)) {
vmkuw_post_system_call(dcontext);
break;
}
#endif
} /* switch */
DODEBUG({
if (ignorable_system_call_normalized(sysnum)) {
STATS_INC(post_syscall_ignorable);
} else {
/* Many syscalls can fail though they aren't ignored. However, they
* shouldn't happen without us knowing about them. See PR 402769
* for SYS_close case.
*/
if (!(success || sysnum == SYS_close ||
IF_MACOS(sysnum == SYS_close_nocancel ||)
dcontext->expect_last_syscall_to_fail)) {
LOG(THREAD, LOG_SYSCALLS, 1,
"Unexpected failure of non-ignorable syscall %d\n", sysnum);
}
}
});
exit_post_system_call:
#ifdef CLIENT_INTERFACE
/* The instrument_post_syscall should be called after DR finishes all
* its operations, since DR needs to know the real syscall results,
* and any changes made by the client are simply to fool the app.
* Also, dr_syscall_invoke_another() needs to set eax, which shouldn't
* affect the result of the 1st syscall. Xref i#1.
*/
/* after restore of xbp so client sees it as though was sysenter */
instrument_post_syscall(dcontext, sysnum);
#endif
dcontext->whereami = old_whereami;
}
/* initializes dynamorio library bounds.
* does not use any heap.
* assumed to be called prior to find_executable_vm_areas.
*/
static int
get_dynamo_library_bounds(void)
{
/* Note that we're not counting DYNAMORIO_PRELOAD_NAME as a DR area, to match
* Windows, so we should unload it like we do there. The other reason not to
* count it is so is_in_dynamo_dll() can be the only exception to the
* never-execute-from-DR-areas list rule
*/
int res;
app_pc check_start, check_end;
char *libdir;
const char *dynamorio_libname;
#ifdef STATIC_LIBRARY
/* We don't know our image name, so look up our bounds with an internal
* address.
*/
dynamorio_libname = NULL;
check_start = (app_pc)&get_dynamo_library_bounds;
#else /* !STATIC_LIBRARY */
# ifdef LINUX
/* PR 361594: we get our bounds from linker-provided symbols.
* Note that referencing the value of these symbols will crash:
* always use the address only.
*/
extern int dynamorio_so_start, dynamorio_so_end;
dynamo_dll_start = (app_pc) &dynamorio_so_start;
dynamo_dll_end = (app_pc) ALIGN_FORWARD(&dynamorio_so_end, PAGE_SIZE);
# elif defined(MACOS)
dynamo_dll_start = module_dynamorio_lib_base();
# endif
check_start = dynamo_dll_start;
dynamorio_libname = IF_UNIT_TEST_ELSE(UNIT_TEST_EXE_NAME,DYNAMORIO_LIBRARY_NAME);
#endif /* STATIC_LIBRARY */
res = memquery_library_bounds(dynamorio_libname,
&check_start, &check_end,
dynamorio_library_path,
BUFFER_SIZE_ELEMENTS(dynamorio_library_path));
LOG(GLOBAL, LOG_VMAREAS, 1, PRODUCT_NAME" library path: %s\n",
dynamorio_library_path);
#if !defined(STATIC_LIBRARY) && defined(LINUX)
ASSERT(check_start == dynamo_dll_start && check_end == dynamo_dll_end);
#elif defined(MACOS)
ASSERT(check_start == dynamo_dll_start);
dynamo_dll_end = check_end;
#else
dynamo_dll_start = check_start;
dynamo_dll_end = check_end;
#endif
LOG(GLOBAL, LOG_VMAREAS, 1, "DR library bounds: "PFX" to "PFX"\n",
dynamo_dll_start, dynamo_dll_end);
ASSERT(res > 0);
/* Issue 20: we need the path to the alt arch */
strncpy(dynamorio_alt_arch_path, dynamorio_library_path,
BUFFER_SIZE_ELEMENTS(dynamorio_alt_arch_path));
/* Assumption: libdir name is not repeated elsewhere in path */
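/* E.g. (hypothetical path, assuming the default lib64/lib32 libdir names):
*   ".../dynamorio/lib64/release/libdynamorio.so"
* becomes
*   ".../dynamorio/lib32/release/libdynamorio.so"
*/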
libdir = strstr(dynamorio_alt_arch_path, IF_X64_ELSE(DR_LIBDIR_X64, DR_LIBDIR_X86));
if (libdir != NULL) {
const char *newdir = IF_X64_ELSE(DR_LIBDIR_X86, DR_LIBDIR_X64);
/* do NOT place the NULL */
strncpy(libdir, newdir, strlen(newdir));
} else {
SYSLOG_INTERNAL_WARNING("unable to determine lib path for cross-arch execve");
}
NULL_TERMINATE_BUFFER(dynamorio_alt_arch_path);
LOG(GLOBAL, LOG_VMAREAS, 1, PRODUCT_NAME" alt arch path: %s\n",
dynamorio_alt_arch_path);
return res;
}
/* get full path to our own library (cached); used for forking and for the message file name */
char*
get_dynamorio_library_path(void)
{
if (!dynamorio_library_path[0]) { /* not cached */
get_dynamo_library_bounds();
}
return dynamorio_library_path;
}
#ifdef LINUX
/* Get full path+name of executable file from /proc/self/exe. Returns an empty
* string on error.
* FIXME i#47: This will return DR's path when using early injection.
*/
static char *
read_proc_self_exe(bool ignore_cache)
{
static char exepath[MAXIMUM_PATH];
static bool tried = false;
# ifdef MACOS
ASSERT_NOT_IMPLEMENTED(false);
# endif
if (!tried || ignore_cache) {
tried = true;
/* assume we have /proc/self/exe symlink: could add HAVE_PROC_EXE
* but we have no alternative solution except assuming the first
* /proc/self/maps entry is the executable
*/
ssize_t res;
DEBUG_DECLARE(int len = )
snprintf(exepath, BUFFER_SIZE_ELEMENTS(exepath),
"/proc/%d/exe", get_process_id());
ASSERT(len > 0);
NULL_TERMINATE_BUFFER(exepath);
/* i#960: readlink does not null terminate, so we do it. */
res = dynamorio_syscall(SYS_readlink, 3, exepath, exepath,
BUFFER_SIZE_ELEMENTS(exepath)-1);
ASSERT(res < BUFFER_SIZE_ELEMENTS(exepath));
exepath[MAX(res, 0)] = '\0';
NULL_TERMINATE_BUFFER(exepath);
}
return exepath;
}
#endif /* LINUX */
app_pc
get_application_base(void)
{
if (executable_start == NULL) {
#ifdef HAVE_MEMINFO
/* Haven't done find_executable_vm_areas() yet so walk maps ourselves */
const char *name = get_application_name();
if (name != NULL && name[0] != '\0') {
memquery_iter_t iter;
memquery_iterator_start(&iter, NULL, false/*won't alloc*/);
while (memquery_iterator_next(&iter)) {
if (strcmp(iter.comment, name) == 0) {
executable_start = iter.vm_start;
executable_end = iter.vm_end;
break;
}
}
memquery_iterator_stop(&iter);
}
#else
/* We have to fail. Should we dl_iterate this early? */
#endif
}
return executable_start;
}
app_pc
get_application_end(void)
{
if (executable_end == NULL)
get_application_base();
return executable_end;
}
app_pc
get_image_entry()
{
static app_pc image_entry_point = NULL;
if (image_entry_point == NULL && executable_start != NULL) {
module_area_t *ma;
os_get_module_info_lock();
ma = module_pc_lookup(executable_start);
ASSERT(ma != NULL);
if (ma != NULL) {
ASSERT(executable_start == ma->start);
SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT);
image_entry_point = ma->entry_point;
SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT);
}
os_get_module_info_unlock();
}
return image_entry_point;
}
#ifdef DEBUG
void
mem_stats_snapshot()
{
/* FIXME: NYI */
}
#endif
bool
is_in_dynamo_dll(app_pc pc)
{
ASSERT(dynamo_dll_start != NULL);
#ifdef VMX86_SERVER
/* We want to consider vmklib as part of the DR lib for allowing
* execution (_init calls os_in_vmkernel_classic()) and for
* reporting crashes as our fault
*/
if (vmk_in_vmklib(pc))
return true;
#endif
return (pc >= dynamo_dll_start && pc < dynamo_dll_end);
}
app_pc
get_dynamorio_dll_start()
{
if (dynamo_dll_start == NULL)
get_dynamo_library_bounds();
ASSERT(dynamo_dll_start != NULL);
return dynamo_dll_start;
}
app_pc
get_dynamorio_dll_end()
{
if (dynamo_dll_end == NULL)
get_dynamo_library_bounds();
ASSERT(dynamo_dll_end != NULL);
return dynamo_dll_end;
}
app_pc
get_dynamorio_dll_preferred_base()
{
/* on Linux there is no preferred base if we're PIC,
* so this is always equal to dynamo_dll_start */
return get_dynamorio_dll_start();
}
/* assumed to be called after find_dynamo_library_vm_areas() */
int
find_executable_vm_areas(void)
{
int count = 0;
#ifdef MACOS
app_pc shared_start, shared_end;
bool have_shared = module_dyld_shared_region(&shared_start, &shared_end);
#endif
#ifndef HAVE_MEMINFO_QUERY
/* We avoid tracking the innards of vmheap for all_memory_areas by
* adding a single no-access region for the whole vmheap.
* Queries from heap routines use _from_os.
* Queries in check_thread_vm_area are fine getting "noaccess": wants
* any DR memory not on exec areas list to be noaccess.
* Queries from clients: should be ok to hide innards. Marking noaccess
* should be safer than marking free, as unruly client might try to mmap
* something in the free space: better to have it think it's reserved but
* not yet used memory. FIXME: we're not marking beyond-vmheap DR regions
* as noaccess!
*/
byte *our_heap_start, *our_heap_end;
get_vmm_heap_bounds(&our_heap_start, &our_heap_end);
if (our_heap_end - our_heap_start > 0) {
memcache_update_locked(our_heap_start, our_heap_end, MEMPROT_NONE,
DR_MEMTYPE_DATA, false/*!exists*/);
}
#endif
#ifndef HAVE_MEMINFO
count = find_vm_areas_via_probe();
#else
memquery_iter_t iter;
memquery_iterator_start(&iter, NULL, true/*may alloc*/);
while (memquery_iterator_next(&iter)) {
bool image = false;
size_t size = iter.vm_end - iter.vm_start;
/* i#479, hide private module and match Windows's behavior */
bool skip = dynamo_vm_area_overlap(iter.vm_start, iter.vm_end) &&
!is_in_dynamo_dll(iter.vm_start) /* our own text section is ok */
/* client lib text section is ok (xref i#487) */
IF_CLIENT_INTERFACE(&& !is_in_client_lib(iter.vm_start));
DEBUG_DECLARE(const char *map_type = "Private");
/* we can't really tell what's a stack and what's not, but we rely on
* our passing NULL preventing rwx regions from being added to executable
* or future list, even w/ -executable_if_alloc
*/
LOG(GLOBAL, LOG_VMAREAS, 2,
"start="PFX" end="PFX" prot=%x comment=%s\n",
iter.vm_start, iter.vm_end, iter.prot, iter.comment);
/* Issue 89: the vdso might be loaded inside ld.so as below,
     * which causes the ASSERT_CURIOSITY in mmap_check_for_module_overlap to fail.
* b7fa3000-b7fbd000 r-xp 00000000 08:01 108679 /lib/ld-2.8.90.so
* b7fbd000-b7fbe000 r-xp b7fbd000 00:00 0 [vdso]
* b7fbe000-b7fbf000 r--p 0001a000 08:01 108679 /lib/ld-2.8.90.so
* b7fbf000-b7fc0000 rw-p 0001b000 08:01 108679 /lib/ld-2.8.90.so
* So we always first check if it is a vdso page before calling
* mmap_check_for_module_overlap.
* Update: with i#160/PR 562667 handling non-contiguous modules like
* ld.so we now gracefully handle other objects like vdso in gaps in
* module, but it's simpler to leave this ordering here.
*/
if (skip) {
/* i#479, hide private module and match Windows's behavior */
LOG(GLOBAL, LOG_VMAREAS, 2, PFX"-"PFX" skipping: internal DR region\n",
iter.vm_start, iter.vm_end);
#ifdef MACOS
} else if (have_shared && iter.vm_start >= shared_start &&
iter.vm_start < shared_end) {
/* Skip modules we happen to find inside the dyld shared cache,
* as we'll fail to identify the library. We add them
* in module_walk_dyld_list instead.
*/
image = true;
#endif
} else if (strncmp(iter.comment, VSYSCALL_PAGE_MAPS_NAME,
strlen(VSYSCALL_PAGE_MAPS_NAME)) == 0
IF_X64_ELSE(|| strncmp(iter.comment, VSYSCALL_REGION_MAPS_NAME,
strlen(VSYSCALL_REGION_MAPS_NAME)) == 0,
/* Older kernels do not label it as "[vdso]", but it is hardcoded there */
/* 32-bit */
|| iter.vm_start == VSYSCALL_PAGE_START_HARDCODED)) {
# ifndef X64
/* We assume no vsyscall page for x64; thus, checking the
* hardcoded address shouldn't have any false positives.
*/
ASSERT(iter.vm_end - iter.vm_start == PAGE_SIZE);
ASSERT(!dynamo_initialized); /* .data should be +w */
ASSERT(vsyscall_page_start == NULL);
/* we're not considering as "image" even if part of ld.so (xref i#89) and
* thus we aren't adjusting our code origins policies to remove the
* vsyscall page exemption.
*/
DODEBUG({ map_type = "VDSO"; });
vsyscall_page_start = iter.vm_start;
LOG(GLOBAL, LOG_VMAREAS, 1, "found vsyscall page @ "PFX" %s\n",
vsyscall_page_start, iter.comment);
# else
        /* i#172, i#430: fix kernels where the vdso or vsyscall page is listed
         * as unreadable, e.g.
         * ffffffffff600000-ffffffffffe00000 ---p 00000000 00:00 0 [vdso]
         * ffffffffff600000-ffffffffffe00000 ---p 00000000 00:00 0 [vsyscall]
         * even though the page is in fact readable.
         */
if (!TESTALL((PROT_READ|PROT_EXEC), iter.prot))
iter.prot |= (PROT_READ|PROT_EXEC);
# endif
} else if (mmap_check_for_module_overlap(iter.vm_start, size,
TEST(MEMPROT_READ, iter.prot),
iter.inode, false)) {
/* we already added the whole image region when we hit the first map for it */
image = true;
DODEBUG({ map_type = "ELF SO"; });
} else if (TEST(MEMPROT_READ, iter.prot) &&
module_is_header(iter.vm_start, size)) {
size_t image_size = size;
app_pc mod_base, mod_first_end, mod_max_end;
char *exec_match;
bool found_exec = false;
image = true;
DODEBUG({ map_type = "ELF SO"; });
LOG(GLOBAL, LOG_VMAREAS, 2,
"Found already mapped module first segment :\n"
"\t"PFX"-"PFX"%s inode="UINT64_FORMAT_STRING" name=%s\n",
iter.vm_start, iter.vm_end, TEST(MEMPROT_EXEC, iter.prot) ? " +x": "",
iter.inode, iter.comment);
#ifdef LINUX
ASSERT_CURIOSITY(iter.inode != 0); /* mapped images should have inodes */
#endif
ASSERT_CURIOSITY(iter.offset == 0); /* first map shouldn't have offset */
/* Get size by walking the program headers. This includes .bss. */
if (module_walk_program_headers(iter.vm_start, size, false,
&mod_base, &mod_first_end,
&mod_max_end, NULL, NULL)) {
image_size = mod_max_end - mod_base;
} else {
ASSERT_NOT_REACHED();
}
LOG(GLOBAL, LOG_VMAREAS, 2,
"Found already mapped module total module :\n"
"\t"PFX"-"PFX" inode="UINT64_FORMAT_STRING" name=%s\n",
iter.vm_start, iter.vm_start+image_size, iter.inode, iter.comment);
/* look for executable */
#ifdef LINUX
exec_match = get_application_name();
if (exec_match != NULL && exec_match[0] != '\0')
found_exec = (strcmp(iter.comment, exec_match) == 0);
#else
/* We don't have a nice normalized name: it can have ./ or ../ inside
* it. But, we can distinguish an exe from a lib here, even for PIE,
* so we go with that plus a basename comparison.
*/
exec_match = (char *) get_application_short_name();
if (module_is_executable(iter.vm_start) &&
exec_match != NULL && exec_match[0] != '\0') {
const char *iter_basename = strrchr(iter.comment, '/');
if (iter_basename == NULL)
iter_basename = iter.comment;
else
iter_basename++;
found_exec = (strcmp(iter_basename, exec_match) == 0);
}
#endif
if (found_exec) {
if (executable_start == NULL)
executable_start = iter.vm_start;
else
ASSERT(iter.vm_start == executable_start);
LOG(GLOBAL, LOG_VMAREAS, 2,
"Found executable %s @"PFX"-"PFX" %s\n", get_application_name(),
iter.vm_start, iter.vm_start+image_size, iter.comment);
}
/* We don't yet know whether contiguous so we have to settle for the
* first segment's size. We'll update it in module_list_add().
*/
module_list_add(iter.vm_start, mod_first_end - mod_base,
false, iter.comment, iter.inode);
#ifdef MACOS
/* look for dyld */
if (strcmp(iter.comment, "/usr/lib/dyld") == 0)
module_walk_dyld_list(iter.vm_start);
#endif
} else if (iter.inode != 0) {
DODEBUG({ map_type = "Mapped File"; });
}
/* add all regions (incl. dynamo_areas and stack) to all_memory_areas */
LOG(GLOBAL, LOG_VMAREAS, 4,
"find_executable_vm_areas: adding: "PFX"-"PFX" prot=%d\n",
iter.vm_start, iter.vm_end, iter.prot);
IF_NO_MEMQUERY(memcache_update_locked(iter.vm_start, iter.vm_end, iter.prot,
image ? DR_MEMTYPE_IMAGE :
DR_MEMTYPE_DATA, false/*!exists*/));
/* FIXME: best if we could pass every region to vmareas, but
* it has no way of determining if this is a stack b/c we don't have
* a dcontext at this point -- so we just don't pass the stack
*/
if (!skip /* i#479, hide private module and match Windows's behavior */ &&
app_memory_allocation(NULL, iter.vm_start, (iter.vm_end - iter.vm_start),
iter.prot, image _IF_DEBUG(map_type))) {
count++;
}
}
memquery_iterator_stop(&iter);
#endif /* !HAVE_MEMINFO */
#ifndef HAVE_MEMINFO_QUERY
    DOLOG(4, LOG_VMAREAS, memcache_print(GLOBAL, "init: all memory areas:\n"););
#endif
/* now that we've walked memory print all modules */
LOG(GLOBAL, LOG_VMAREAS, 2, "Module list after memory walk\n");
DOLOG(1, LOG_VMAREAS, { print_modules(GLOBAL, DUMP_NOT_XML); });
STATS_ADD(num_app_code_modules, count);
/* now that we have the modules set up, query libc */
get_libc_errno_location(true/*force init*/);
return count;
}
/* Initializes the DynamoRIO library bounds.
 * Does not use any heap.
 * Assumed to be called prior to find_executable_vm_areas().
 */
int
find_dynamo_library_vm_areas(void)
{
#ifndef STATIC_LIBRARY
/* We didn't add inside get_dynamo_library_bounds b/c it was called pre-alloc.
* We don't bother to break down the sub-regions.
* Assumption: we don't need to have the protection flags for DR sub-regions.
* For static library builds, DR's code is in the exe and isn't considered
* to be a DR area.
*/
add_dynamo_vm_area(get_dynamorio_dll_start(), get_dynamorio_dll_end(),
MEMPROT_READ|MEMPROT_WRITE|MEMPROT_EXEC,
true /* from image */ _IF_DEBUG(dynamorio_library_path));
#endif
#ifdef VMX86_SERVER
if (os_in_vmkernel_userworld())
vmk_add_vmklib_to_dynamo_areas();
#endif
return 1;
}
bool
get_stack_bounds(dcontext_t *dcontext, byte **base, byte **top)
{
os_thread_data_t *ostd = (os_thread_data_t *) dcontext->os_field;
if (ostd->stack_base == NULL) {
        /* Initialize on-demand since we don't have the app esp handy in
         * os_thread_init().
         * FIXME: if we do hit cases of fragmented stacks here, the right fix
         * is to merge adjacent rwx regions and assume their union is the
         * stack; otherwise we'd need a special stack-init routine called from
         * x86.asm's new_thread_dynamo_start and internal_dynamo_start, and
         * the latter is not a do-once...
         */
size_t size = 0;
bool ok;
/* store stack info at thread startup, since stack can get fragmented in
* /proc/self/maps w/ later mprotects and it can be hard to piece together later
*/
if (IF_MEMQUERY_ELSE(false, DYNAMO_OPTION(use_all_memory_areas))) {
ok = get_memory_info((app_pc)get_mcontext(dcontext)->xsp,
&ostd->stack_base, &size, NULL);
} else {
ok = get_memory_info_from_os((app_pc)get_mcontext(dcontext)->xsp,
&ostd->stack_base, &size, NULL);
}
ASSERT(ok);
ostd->stack_top = ostd->stack_base + size;
LOG(THREAD, LOG_THREADS, 1, "App stack is "PFX"-"PFX"\n",
ostd->stack_base, ostd->stack_top);
}
if (base != NULL)
*base = ostd->stack_base;
if (top != NULL)
*top = ostd->stack_top;
return true;
}
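/* Illustrative use of get_stack_bounds() (a sketch, not from a real caller):
 *     byte *base, *top;
 *     if (get_stack_bounds(dcontext, &base, &top))
 *         ... [base, top) bounds this thread's app stack ...
 */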
#ifdef RETURN_AFTER_CALL
initial_call_stack_status_t
at_initial_stack_bottom(dcontext_t *dcontext, app_pc target_pc)
{
/* We can't rely exclusively on finding the true stack bottom
* b/c we can't always walk the call stack (PR 608990) so we
* use the image entry as our primary trigger
*/
if (executable_start != NULL/*defensive*/ && reached_image_entry_yet()) {
return INITIAL_STACK_EMPTY;
} else {
/* If our stack walk ends early we could have false positives, but
* that's better than false negatives if we miss the image entry
* or we were unable to find the executable_start
*/
os_thread_data_t *ostd = (os_thread_data_t *) dcontext->os_field;
if (target_pc == ostd->stack_bottom_pc) {
return INITIAL_STACK_BOTTOM_REACHED;
} else {
return INITIAL_STACK_BOTTOM_NOT_REACHED;
}
}
}
#endif /* RETURN_AFTER_CALL */
/* Uses our cached data structures (if in use, else raw query) to retrieve memory info */
bool
query_memory_ex(const byte *pc, OUT dr_mem_info_t *out_info)
{
#ifdef HAVE_MEMINFO_QUERY
return query_memory_ex_from_os(pc, out_info);
#else
return memcache_query_memory(pc, out_info);
#endif
}
bool
query_memory_cur_base(const byte *pc, OUT dr_mem_info_t *info)
{
return query_memory_ex(pc, info);
}
/* Use our cached data structures (if in use, else raw query) to retrieve memory info */
bool
get_memory_info(const byte *pc, byte **base_pc, size_t *size,
uint *prot /* OUT optional, returns MEMPROT_* value */)
{
dr_mem_info_t info;
#ifdef CLIENT_INTERFACE
if (is_vmm_reserved_address((byte*)pc, 1)) {
if (!query_memory_ex_from_os(pc, &info) || info.type == DR_MEMTYPE_FREE)
return false;
} else
#endif
if (!query_memory_ex(pc, &info) || info.type == DR_MEMTYPE_FREE)
return false;
if (base_pc != NULL)
*base_pc = info.base_pc;
if (size != NULL)
*size = info.size;
if (prot != NULL)
*prot = info.prot;
return true;
}
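/* Illustrative use of get_memory_info() (a sketch; all out-params are
 * optional and may be passed as NULL):
 *     uint prot;
 *     if (get_memory_info(pc, NULL, NULL, &prot) && TEST(MEMPROT_WRITE, prot))
 *         ... pc lies in writable, allocated memory ...
 */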
/* We assume that this routine might be called instead of query_memory_ex()
* b/c the caller is in a fragile location and cannot acquire locks, so
* we try to do the same here.
*/
bool
query_memory_ex_from_os(const byte *pc, OUT dr_mem_info_t *info)
{
bool have_type = false;
bool res = memquery_from_os(pc, info, &have_type);
if (!res) {
/* No other failure types for now */
info->type = DR_MEMTYPE_ERROR;
} else if (res && !have_type) {
/* We pass 0 instead of info->size b/c even if marked as +r we can still
* get SIGBUS if beyond end of mmapped file: not uncommon if querying
* in middle of library load before .bss fully set up (PR 528744).
* However, if there is no fault handler, is_elf_so_header's safe_read will
* recurse to here, so in that case we use info->size but we assume
* it's only at init or exit and so not in the middle of a load
* and less likely to be querying a random mmapped file.
* The cleaner fix is to allow safe_read to work w/o a dcontext or
* fault handling: i#350/PR 529066.
*/
if (TEST(MEMPROT_READ, info->prot) &&
module_is_header(info->base_pc, fault_handling_initialized ? 0 : info->size))
info->type = DR_MEMTYPE_IMAGE;
else {
/* FIXME: won't quite match find_executable_vm_areas marking as
* image: can be doubly-mapped so; don't want to count vdso; etc.
*/
info->type = DR_MEMTYPE_DATA;
}
}
return res;
}
bool
get_memory_info_from_os(const byte *pc, byte **base_pc, size_t *size,
uint *prot /* OUT optional, returns MEMPROT_* value */)
{
dr_mem_info_t info;
if (!query_memory_ex_from_os(pc, &info) || info.type == DR_MEMTYPE_FREE)
return false;
if (base_pc != NULL)
*base_pc = info.base_pc;
if (size != NULL)
*size = info.size;
if (prot != NULL)
*prot = info.prot;
return true;
}
/* in utils.c, exported only for our hack! */
extern void deadlock_avoidance_unlock(mutex_t *lock, bool ownable);
void
mutex_wait_contended_lock(mutex_t *lock)
{
#ifdef CLIENT_INTERFACE
dcontext_t *dcontext = get_thread_private_dcontext();
bool set_client_safe_for_synch =
((dcontext != NULL) && IS_CLIENT_THREAD(dcontext) &&
((mutex_t *)dcontext->client_data->client_grab_mutex == lock));
#endif
/* i#96/PR 295561: use futex(2) if available */
if (ksynch_kernel_support()) {
/* Try to get the lock. If already held, it's fine to store any value
* > LOCK_SET_STATE (we don't rely on paired incs/decs) so that
* the next unlocker will call mutex_notify_released_lock().
*/
ptr_int_t res;
#ifndef LINUX /* we actually don't use this for Linux: see below */
KSYNCH_TYPE *event = mutex_get_contended_event(lock);
ASSERT(event != NULL && ksynch_var_initialized(event));
#endif
while (atomic_exchange_int(&lock->lock_requests, LOCK_CONTENDED_STATE) !=
LOCK_FREE_STATE) {
#ifdef CLIENT_INTERFACE
if (set_client_safe_for_synch)
dcontext->client_data->client_thread_safe_for_synch = true;
#endif
/* Unfortunately the synch semantics are different for Linux vs Mac.
* We have to use lock_requests as the futex to avoid waiting if
* lock_requests changes, while on Mac the underlying synch prevents
* a wait there.
*/
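            /* For reference: on Linux, ksynch_wait on an address maps to the
             * futex(2) system call.  An illustrative expansion (a sketch, not
             * the actual ksynch implementation) is roughly:
             *     syscall(SYS_futex, &lock->lock_requests, FUTEX_WAIT,
             *             LOCK_CONTENDED_STATE, NULL, NULL, 0);
             * The kernel atomically re-checks that the futex word still equals
             * the expected value before sleeping, which closes the race between
             * our atomic_exchange above and the sleep.
             */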
#ifdef LINUX
/* We'll abort the wait if lock_requests has changed at all.
* We can't have a series of changes that result in no apparent
* change w/o someone acquiring the lock, b/c
* mutex_notify_released_lock() sets lock_requests to LOCK_FREE_STATE.
*/
res = ksynch_wait(&lock->lock_requests, LOCK_CONTENDED_STATE);
#else
res = ksynch_wait(event, 0);
#endif
if (res != 0 && res != -EWOULDBLOCK)
os_thread_yield();
#ifdef CLIENT_INTERFACE
if (set_client_safe_for_synch)
dcontext->client_data->client_thread_safe_for_synch = false;
#endif
/* we don't care whether properly woken (res==0), var mismatch
* (res==-EWOULDBLOCK), or error: regardless, someone else
* could have acquired the lock, so we try again
*/
}
} else {
/* we now have to undo our earlier request */
atomic_dec_and_test(&lock->lock_requests);
while (!mutex_trylock(lock)) {
#ifdef CLIENT_INTERFACE
if (set_client_safe_for_synch)
dcontext->client_data->client_thread_safe_for_synch = true;
#endif
os_thread_yield();
#ifdef CLIENT_INTERFACE
if (set_client_safe_for_synch)
dcontext->client_data->client_thread_safe_for_synch = false;
#endif
}
#ifdef DEADLOCK_AVOIDANCE
/* HACK: trylock's success causes it to do DEADLOCK_AVOIDANCE_LOCK, so to
* avoid two in a row (causes assertion on owner) we unlock here
* In the future we will remove the trylock here and this will go away.
*/
deadlock_avoidance_unlock(lock, true);
#endif
}
return;
}
void
mutex_notify_released_lock(mutex_t *lock)
{
/* i#96/PR 295561: use futex(2) if available. */
if (ksynch_kernel_support()) {
/* Set to LOCK_FREE_STATE to avoid concurrent lock attempts from
* resulting in a futex_wait value match w/o anyone owning the lock
*/
lock->lock_requests = LOCK_FREE_STATE;
/* No reason to wake multiple threads: just one */
#ifdef LINUX
ksynch_wake(&lock->lock_requests);
#else
ksynch_wake(&lock->contended_event);
#endif
} /* else nothing to do */
}
/* The read_write_lock_t implementation doesn't expect the contention path
 * helpers to guarantee the lock is held (unlike mutexes), so simple
 * yields are still acceptable.
 */
void
rwlock_wait_contended_writer(read_write_lock_t *rwlock)
{
os_thread_yield();
}
void
rwlock_notify_writer(read_write_lock_t *rwlock)
{
/* nothing to do here */
}
void
rwlock_wait_contended_reader(read_write_lock_t *rwlock)
{
os_thread_yield();
}
void
rwlock_notify_readers(read_write_lock_t *rwlock)
{
/* nothing to do here */
}
/***************************************************************************/
/* events are un-signaled when successfully waited upon. */
typedef struct linux_event_t {
/* Any function that sets this flag must also notify possibly waiting
* thread(s). See i#96/PR 295561.
*/
KSYNCH_TYPE signaled;
mutex_t lock;
} linux_event_t;
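/* Typical usage (an illustrative sketch, not taken from a real caller):
 *     event_t e = create_event();
 *     ... hand e to a waiter, which blocks in wait_for_event(e) ...
 *     signal_event(e);
 *     destroy_event(e);
 * Since a successful wait un-signals the event, each signal releases at
 * most one waiter.
 */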
/* FIXME: this routine will need to have a macro wrapper to let us assign different ranks to
* all events for DEADLOCK_AVOIDANCE. Currently a single rank seems to work.
*/
event_t
create_event(void)
{
event_t e = (event_t) global_heap_alloc(sizeof(linux_event_t) HEAPACCT(ACCT_OTHER));
ksynch_init_var(&e->signaled);
ASSIGN_INIT_LOCK_FREE(e->lock, event_lock); /* FIXME: we'll need to pass the event name here */
return e;
}
void
destroy_event(event_t e)
{
DELETE_LOCK(e->lock);
ksynch_free_var(&e->signaled);
global_heap_free(e, sizeof(linux_event_t) HEAPACCT(ACCT_OTHER));
}
void
signal_event(event_t e)
{
mutex_lock(&e->lock);
ksynch_set_value(&e->signaled, 1);
ksynch_wake(&e->signaled);
    LOG(THREAD_GET, LOG_THREADS, 3, "thread "TIDFMT" signalling event "PFX"\n",
        get_thread_id(), e);
mutex_unlock(&e->lock);
}
void
reset_event(event_t e)
{
mutex_lock(&e->lock);
ksynch_set_value(&e->signaled, 0);
    LOG(THREAD_GET, LOG_THREADS, 3, "thread "TIDFMT" resetting event "PFX"\n",
        get_thread_id(), e);
mutex_unlock(&e->lock);
}
void
wait_for_event(event_t e)
{
#ifdef DEBUG
dcontext_t *dcontext = get_thread_private_dcontext();
#endif
/* Use a user-space event on Linux, a kernel event on Windows. */
    LOG(THREAD, LOG_THREADS, 3, "thread "TIDFMT" waiting for event "PFX"\n",
        get_thread_id(), e);
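    /* Double-checked pattern: test the signaled flag outside the lock,
     * confirm it under the lock, and re-check after any ksynch_wait() so
     * that a signal arriving between the check and the wait is not lost.
     */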
while (true) {
if (ksynch_get_value(&e->signaled) == 1) {
mutex_lock(&e->lock);
if (ksynch_get_value(&e->signaled) == 0) {
/* some other thread beat us to it */
LOG(THREAD, LOG_THREADS, 3, "thread "TIDFMT" was beaten to event "PFX"\n",
                    get_thread_id(), e);
mutex_unlock(&e->lock);
} else {
/* reset the event */
ksynch_set_value(&e->signaled, 0);
mutex_unlock(&e->lock);
LOG(THREAD, LOG_THREADS, 3,
"thread "TIDFMT" finished waiting for event "PFX"\n", get_thread_id(),e);
return;
}
} else {
            /* Wait only while the signaled flag is still 0.  The return value
             * doesn't matter because the flag is re-checked at the top of the
             * loop.
             */
ksynch_wait(&e->signaled, 0);
}
if (ksynch_get_value(&e->signaled) == 0) {
/* If it still has to wait, give up the cpu. */
os_thread_yield();
}
}
}
/***************************************************************************
* DIRECTORY ITERATOR
*/
/* These structs are written to the buf that we pass to getdents. We can
* iterate them by adding d_reclen to the current buffer offset and interpreting
* that as the next entry.
*/
struct linux_dirent {
long d_ino; /* Inode number. */
off_t d_off; /* Offset to next linux_dirent. */
unsigned short d_reclen; /* Length of this linux_dirent. */
char d_name[]; /* File name, null-terminated. */
#if 0 /* Has to be #if 0 because it's after the flexible array. */
char d_pad; /* Always zero? */
char d_type; /* File type, since Linux 2.6.4. */
#endif
};
#define CURRENT_DIRENT(iter) \
((struct linux_dirent *)(&iter->buf[iter->off]))
static void
os_dir_iterator_start(dir_iterator_t *iter, file_t fd)
{
iter->fd = fd;
iter->off = 0;
iter->end = 0;
}
static bool
os_dir_iterator_next(dir_iterator_t *iter)
{
#ifdef MACOS
/* We can use SYS_getdirentries, but do we even need a dir iterator?
* On Linux it's only used to enumerate /proc/pid/task.
*/
ASSERT_NOT_IMPLEMENTED(false);
return false;
#else
if (iter->off < iter->end) {
/* Have existing dents, get the next offset. */
iter->off += CURRENT_DIRENT(iter)->d_reclen;
ASSERT(iter->off <= iter->end);
}
if (iter->off == iter->end) {
/* Do a getdents syscall. Unlike when reading a file, the kernel will
* not read a partial linux_dirent struct, so we don't need to shift the
* left over bytes to the buffer start. See the getdents manpage for
* the example code that this is based on.
*/
iter->off = 0;
iter->end = dynamorio_syscall(SYS_getdents, 3, iter->fd, iter->buf,
sizeof(iter->buf));
ASSERT(iter->end <= sizeof(iter->buf));
if (iter->end <= 0) { /* No more dents, or error. */
iter->name = NULL;
if (iter->end < 0) {
LOG(GLOBAL, LOG_SYSCALLS, 1,
"getdents syscall failed with errno %d\n", -iter->end);
}
return false;
}
}
iter->name = CURRENT_DIRENT(iter)->d_name;
return true;
#endif
}
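/* Illustrative use of the directory iterator (mirrors os_list_threads()
 * below; error handling elided):
 *     dir_iterator_t iter;
 *     file_t fd = os_open_directory("/proc/self/task", OS_OPEN_READ);
 *     os_dir_iterator_start(&iter, fd);
 *     while (os_dir_iterator_next(&iter))
 *         ... iter.name holds the current entry's file name ...
 *     os_close(fd);
 */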
/***************************************************************************
* THREAD TAKEOVER
*/
/* Record used to synchronize thread takeover. */
typedef struct _takeover_record_t {
thread_id_t tid;
event_t event;
} takeover_record_t;
/* When attempting thread takeover, we store an array of thread id and event
* pairs here. Each thread we signal is supposed to enter DR control and signal
* this event after it has added itself to all_threads.
*
* XXX: What we really want is to be able to use SYS_rt_tgsigqueueinfo (Linux >=
* 2.6.31) to pass the event_t to each thread directly, rather than using this
* side data structure.
*/
static takeover_record_t *thread_takeover_records;
static uint num_thread_takeover_records;
/* This is the dcontext of the thread that initiated the takeover. We read the
* owning_thread and signal_field threads from it in the signaled threads to
* set up siginfo sharing.
*/
static dcontext_t *takeover_dcontext;
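/* Takeover handshake, summarizing the code below:
 * 1) os_take_over_all_unknown_threads() lists /proc/self/task, publishes
 *    thread_takeover_records, and sends SUSPEND_SIGNAL to each unknown tid.
 * 2) Each signaled thread runs os_thread_take_over(), initializes itself
 *    under DR, finds its record by tid, and signals that record's event.
 * 3) The initiator waits on every event, then unpublishes and frees the
 *    records under thread_initexit_lock.
 */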
/* Lists active threads in the process.
* XXX: The /proc man page says /proc/pid/task is only available if the main
* thread is still alive, but experiments on 2.6.38 show otherwise.
*/
static thread_id_t *
os_list_threads(dcontext_t *dcontext, uint *num_threads_out)
{
dir_iterator_t iter;
file_t task_dir;
uint tids_alloced = 10;
uint num_threads = 0;
thread_id_t *new_tids;
thread_id_t *tids;
ASSERT(num_threads_out != NULL);
#ifdef MACOS
/* XXX i#58: NYI.
* We may want SYS_proc_info with PROC_INFO_PID_INFO and PROC_PIDLISTTHREADS,
* or is that just BSD threads and instead we want process_set_tasks()
* and task_info() as in 7.3.1.3 in Singh's OSX book?
*/
*num_threads_out = 0;
return NULL;
#endif
tids = HEAP_ARRAY_ALLOC(dcontext, thread_id_t, tids_alloced,
ACCT_THREAD_MGT, PROTECTED);
task_dir = os_open_directory("/proc/self/task", OS_OPEN_READ);
ASSERT(task_dir != INVALID_FILE);
os_dir_iterator_start(&iter, task_dir);
while (os_dir_iterator_next(&iter)) {
thread_id_t tid;
DEBUG_DECLARE(int r;)
if (strcmp(iter.name, ".") == 0 ||
strcmp(iter.name, "..") == 0)
continue;
IF_DEBUG(r =)
sscanf(iter.name, "%u", &tid);
ASSERT_MESSAGE(CHKLVL_ASSERTS, "failed to parse /proc/pid/task entry",
r == 1);
if (tid <= 0)
continue;
if (num_threads == tids_alloced) {
/* realloc, essentially. Less expensive than counting first. */
new_tids = HEAP_ARRAY_ALLOC(dcontext, thread_id_t, tids_alloced * 2,
ACCT_THREAD_MGT, PROTECTED);
memcpy(new_tids, tids, sizeof(thread_id_t) * tids_alloced);
HEAP_ARRAY_FREE(dcontext, tids, thread_id_t, tids_alloced,
ACCT_THREAD_MGT, PROTECTED);
tids = new_tids;
tids_alloced *= 2;
}
tids[num_threads++] = tid;
}
ASSERT(iter.end == 0); /* No reading errors. */
os_close(task_dir);
/* realloc back down to num_threads for caller simplicity. */
new_tids = HEAP_ARRAY_ALLOC(dcontext, thread_id_t, num_threads,
ACCT_THREAD_MGT, PROTECTED);
memcpy(new_tids, tids, sizeof(thread_id_t) * num_threads);
HEAP_ARRAY_FREE(dcontext, tids, thread_id_t, tids_alloced,
ACCT_THREAD_MGT, PROTECTED);
tids = new_tids;
*num_threads_out = num_threads;
return tids;
}
/* List the /proc/self/task directory and add all unknown thread ids to the
* all_threads hashtable in dynamo.c. Returns true if we found any unknown
* threads and false otherwise. We assume that since we don't know about them
* they are not under DR and have no dcontexts.
*/
bool
os_take_over_all_unknown_threads(dcontext_t *dcontext)
{
uint i;
uint num_threads;
thread_id_t *tids;
uint threads_to_signal = 0;
mutex_lock(&thread_initexit_lock);
CLIENT_ASSERT(thread_takeover_records == NULL,
"Only one thread should attempt app take over!");
/* Find tids for which we have no thread record, meaning they are not under
* our control. Shift them to the beginning of the tids array.
*/
tids = os_list_threads(dcontext, &num_threads);
if (tids == NULL) {
mutex_unlock(&thread_initexit_lock);
return false; /* have to assume no unknown */
}
for (i = 0; i < num_threads; i++) {
thread_record_t *tr = thread_lookup(tids[i]);
if (tr == NULL)
tids[threads_to_signal++] = tids[i];
}
if (threads_to_signal > 0) {
takeover_record_t *records;
/* Assuming pthreads, prepare signal_field for sharing. */
handle_clone(dcontext, PTHREAD_CLONE_FLAGS);
/* Create records with events for all the threads we want to signal. */
LOG(GLOBAL, LOG_THREADS, 1,
"TAKEOVER: publishing takeover records\n");
records = HEAP_ARRAY_ALLOC(dcontext, takeover_record_t,
threads_to_signal, ACCT_THREAD_MGT,
PROTECTED);
for (i = 0; i < threads_to_signal; i++) {
LOG(GLOBAL, LOG_THREADS, 1,
"TAKEOVER: will signal thread "TIDFMT"\n", tids[i]);
records[i].tid = tids[i];
records[i].event = create_event();
}
/* Publish the records and the initial take over dcontext. */
thread_takeover_records = records;
num_thread_takeover_records = threads_to_signal;
takeover_dcontext = dcontext;
/* Signal the other threads. */
for (i = 0; i < threads_to_signal; i++) {
thread_signal(get_process_id(), records[i].tid, SUSPEND_SIGNAL);
}
mutex_unlock(&thread_initexit_lock);
/* Wait for all the threads we signaled. */
ASSERT_OWN_NO_LOCKS();
for (i = 0; i < threads_to_signal; i++) {
wait_for_event(records[i].event);
}
/* Now that we've taken over the other threads, we can safely free the
* records and reset the shared globals.
*/
mutex_lock(&thread_initexit_lock);
LOG(GLOBAL, LOG_THREADS, 1,
"TAKEOVER: takeover complete, unpublishing records\n");
thread_takeover_records = NULL;
num_thread_takeover_records = 0;
takeover_dcontext = NULL;
for (i = 0; i < threads_to_signal; i++) {
destroy_event(records[i].event);
}
HEAP_ARRAY_FREE(dcontext, records, takeover_record_t,
threads_to_signal, ACCT_THREAD_MGT, PROTECTED);
}
mutex_unlock(&thread_initexit_lock);
HEAP_ARRAY_FREE(dcontext, tids, thread_id_t, num_threads,
ACCT_THREAD_MGT, PROTECTED);
return threads_to_signal > 0;
}
/* Takes over the current thread from the signal handler. We notify the thread
* that signaled us by signalling our event in thread_takeover_records.
*/
void
os_thread_take_over(priv_mcontext_t *mc)
{
int r;
uint i;
thread_id_t mytid;
dcontext_t *dcontext;
priv_mcontext_t *dc_mc;
event_t event = NULL;
LOG(GLOBAL, LOG_THREADS, 1,
"TAKEOVER: received signal in thread "TIDFMT"\n", get_sys_thread_id());
/* Do standard DR thread initialization. Mirrors code in
* create_clone_record and new_thread_setup, except we're not putting a
* clone record on the dstack.
*/
r = dynamo_thread_init(NULL, mc _IF_CLIENT_INTERFACE(false));
ASSERT(r == SUCCESS);
dcontext = get_thread_private_dcontext();
ASSERT(dcontext != NULL);
share_siginfo_after_take_over(dcontext, takeover_dcontext);
dynamo_thread_under_dynamo(dcontext);
dc_mc = get_mcontext(dcontext);
*dc_mc = *mc;
dcontext->whereami = WHERE_APP;
dcontext->next_tag = mc->pc;
/* Wake up the thread that initiated the take over. */
mytid = get_thread_id();
ASSERT(thread_takeover_records != NULL);
for (i = 0; i < num_thread_takeover_records; i++) {
if (thread_takeover_records[i].tid == mytid) {
event = thread_takeover_records[i].event;
break;
}
}
ASSERT_MESSAGE(CHKLVL_ASSERTS, "mytid not present in takeover records!",
event != NULL);
signal_event(event);
DOLOG(2, LOG_TOP, {
byte *cur_esp;
GET_STACK_PTR(cur_esp);
LOG(THREAD, LOG_TOP, 2, "%s: next_tag="PFX", cur xsp="PFX", mc->xsp="PFX"\n",
__FUNCTION__, dcontext->next_tag, cur_esp, mc->xsp);
});
/* Start interpreting from the signal context. */
call_switch_stack(dcontext, dcontext->dstack, dispatch,
NULL/*not on initstack*/, false/*shouldn't return*/);
ASSERT_NOT_REACHED();
}
bool
os_thread_take_over_suspended_native(dcontext_t *dcontext)
{
os_thread_data_t *ostd = (os_thread_data_t *) dcontext->os_field;
if (!is_thread_currently_native(dcontext->thread_record) ||
ksynch_get_value(&ostd->suspended) < 0)
return false;
/* Thread is sitting in suspend signal loop so we just set a flag
* for when it resumes:
*/
/* XXX: there's no event for a client to trigger this on so not yet
* tested. i#721 may help.
*/
ASSERT_NOT_TESTED();
ostd->retakeover = true;
return true;
}
/***************************************************************************/
uint
os_random_seed(void)
{
uint seed;
    /* Read from /dev/urandom for non-blocking randomness. */
int urand = os_open("/dev/urandom", OS_OPEN_READ);
DEBUG_DECLARE(int read = )os_read(urand, &seed, sizeof(seed));
ASSERT(read == sizeof(seed));
os_close(urand);
return seed;
}
#ifdef RCT_IND_BRANCH
/* Analyze a range in a possibly new module
* return false if not a code section in a module
* otherwise returns true and adds all valid targets for rct_ind_branch_check
*/
bool
rct_analyze_module_at_violation(dcontext_t *dcontext, app_pc target_pc)
{
    /* FIXME: note that this will NOT find the data section corresponding to
     * the given PC: we don't yet have a corresponding get_allocation_size or
     * an ELF header walk routine on Linux.
     */
app_pc code_start;
size_t code_size;
uint prot;
if (!get_memory_info(target_pc, &code_start, &code_size, &prot))
return false;
    /* TODO: in almost all cases we expect the region at module_base+module_size
     * to be the corresponding data section.
     * Writable but initialized data does need to be processed.
     */
if (code_size > 0) {
app_pc code_end = code_start + code_size;
app_pc data_start;
size_t data_size;
ASSERT(TESTALL(MEMPROT_READ|MEMPROT_EXEC, prot)); /* code */
if (!get_memory_info(code_end, &data_start, &data_size, &prot))
return false;
ASSERT(data_start == code_end);
ASSERT(TESTALL(MEMPROT_READ|MEMPROT_WRITE, prot)); /* data */
app_pc text_start = code_start;
app_pc text_end = data_start + data_size;
/* TODO: performance: should do this only in case relocation info is not present */
DEBUG_DECLARE(uint found = )
find_address_references(dcontext, text_start, text_end,
code_start, code_end);
        LOG(GLOBAL, LOG_RCT, 2, PFX"-"PFX" : %d ind targets of %d code size\n",
            text_start, text_end,
            found, code_size);
return true;
}
return false;
}
#ifdef X64
bool
rct_add_rip_rel_addr(dcontext_t *dcontext, app_pc tgt _IF_DEBUG(app_pc src))
{
/* FIXME PR 276762: not implemented */
return false;
}
#endif
#endif /* RCT_IND_BRANCH */
#ifdef HOT_PATCHING_INTERFACE
void *
get_drmarker_hotp_policy_status_table(void)
{
ASSERT_NOT_IMPLEMENTED(false);
return NULL;
}
void
set_drmarker_hotp_policy_status_table(void *new_table)
{
ASSERT_NOT_IMPLEMENTED(false);
}
byte *
hook_text(byte *hook_code_buf, const app_pc image_addr,
intercept_function_t hook_func, const void *callee_arg,
const after_intercept_action_t action_after,
const bool abort_if_hooked, const bool ignore_cti,
byte **app_code_copy_p, byte **alt_exit_tgt_p)
{
ASSERT_NOT_IMPLEMENTED(false);
return NULL;
}
void
unhook_text(byte *hook_code_buf, app_pc image_addr)
{
ASSERT_NOT_IMPLEMENTED(false);
}
void
insert_jmp_at_tramp_entry(byte *trampoline, byte *target)
{
ASSERT_NOT_IMPLEMENTED(false);
}
#endif /* HOT_PATCHING_INTERFACE */
bool
aslr_is_possible_attack(app_pc target)
{
/* FIXME: ASLR not implemented */
return false;
}
app_pc
aslr_possible_preferred_address(app_pc target_addr)
{
/* FIXME: ASLR not implemented */
return NULL;
}
void
take_over_primary_thread(void)
{
/* nothing to do here */
}
bool
os_current_user_directory(char *directory_prefix /* INOUT */,
uint directory_len,
bool create)
{
/* XXX: could share some of this code w/ corresponding windows routine */
uid_t uid = dynamorio_syscall(SYS_getuid, 0);
char *directory = directory_prefix;
char *dirend = directory_prefix + strlen(directory_prefix);
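    /* E.g., a prefix of "/tmp" and uid 1000 yield "/tmp/dpc-1000". */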
snprintf(dirend, directory_len - (dirend - directory_prefix), "%cdpc-%d",
DIRSEP, uid);
directory_prefix[directory_len - 1] = '\0';
if (!os_file_exists(directory, true/*is dir*/) && create) {
/* XXX: we should ensure we do not follow symlinks */
/* XXX: should add support for CREATE_DIR_FORCE_OWNER */
if (!os_create_dir(directory, CREATE_DIR_REQUIRE_NEW)) {
LOG(GLOBAL, LOG_CACHE, 2,
"\terror creating per-user dir %s\n", directory);
return false;
} else {
LOG(GLOBAL, LOG_CACHE, 2,
"\tcreated per-user dir %s\n", directory);
}
}
return true;
}
bool
os_validate_user_owned(file_t file_or_directory_handle)
{
/* note on Linux this scheme should never be used */
ASSERT(false && "chown Alice evilfile");
return false;
}
bool
os_check_option_compatibility(void)
{
/* no options are Linux OS version dependent */
return false;
}
#ifndef X64
/* Emulate uint64 modulo and division by uint32 on ia32.
* XXX: Does *not* handle 64-bit divisors!
*/
static uint64
uint64_divmod(uint64 dividend, uint64 divisor64, uint32 *remainder)
{
# ifdef X86
/* Assumes little endian, which x86 is. */
union {
uint64 v64;
struct {
uint32 lo;
uint32 hi;
};
} res;
uint32 upper;
uint32 divisor = (uint32) divisor64;
/* Our uses don't use large divisors. */
ASSERT(divisor64 <= UINT_MAX && "divisor is larger than uint32 can hold");
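    /* Worked example (illustrative): dividend = 45ULL << 32, divisor = 15:
     * the high half 45 yields res.hi = 45/15 = 3 with upper = 0; the divl
     * below then computes (0:0)/15 = 0 remainder 0, so res.v64 == 3ULL << 32.
     */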
/* Divide out the high bits first. */
res.v64 = dividend;
upper = res.hi;
res.hi = upper / divisor;
upper %= divisor;
/* Use the unsigned div instruction, which uses EDX:EAX to form a 64-bit
* dividend. We only get a 32-bit quotient out, which is why we divide out
* the high bits first. The quotient will fit in EAX.
*
* DIV r/m32 F7 /6 Unsigned divide EDX:EAX by r/m32, with result stored
* in EAX <- Quotient, EDX <- Remainder.
* inputs:
* EAX = res.lo
* EDX = upper
* rm = divisor
* outputs:
* res.lo = EAX
* *remainder = EDX
* The outputs precede the inputs in gcc inline asm syntax, and so to put
* inputs in EAX and EDX we use "0" and "1".
*/
asm ("divl %2" : "=a" (res.lo), "=d" (*remainder) :
"rm" (divisor), "0" (res.lo), "1" (upper));
return res.v64;
# elif defined(ARM)
ASSERT_NOT_IMPLEMENTED(false);
return 0;
# endif /* X86/ARM */
}
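/* On ia32, gcc lowers 64-bit '/' and '%' to calls to libgcc's __udivdi3 and
 * __umoddi3, so we must supply our own definitions here.  E.g., for uint64 x,
 * "x / 15" compiles to "__udivdi3(x, 15)" (see test_uint64_divmod() below).
 */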
/* Match libgcc's prototype. */
uint64
__udivdi3(uint64 dividend, uint64 divisor)
{
uint32 remainder;
return uint64_divmod(dividend, divisor, &remainder);
}
/* Match libgcc's prototype. */
uint64
__umoddi3(uint64 dividend, uint64 divisor)
{
uint32 remainder;
uint64_divmod(dividend, divisor, &remainder);
return (uint64) remainder;
}
#endif /* !X64 */
#endif /* !NOT_DYNAMORIO_CORE_PROPER: around most of file, to exclude preload */
#if defined(STANDALONE_UNIT_TEST)
void
test_uint64_divmod(void)
{
#ifndef X64
uint64 quotient;
uint32 remainder;
/* Simple division below 2^32. */
quotient = uint64_divmod(9, 3, &remainder);
EXPECT(quotient == 3, true);
EXPECT(remainder == 0, true);
quotient = uint64_divmod(10, 3, &remainder);
EXPECT(quotient == 3, true);
EXPECT(remainder == 1, true);
/* Division when upper bits are less than the divisor. */
quotient = uint64_divmod(45ULL << 31, 1U << 31, &remainder);
EXPECT(quotient == 45, true);
EXPECT(remainder == 0, true);
/* Division when upper bits are greater than the divisor. */
quotient = uint64_divmod(45ULL << 32, 15, &remainder);
EXPECT(quotient == 3ULL << 32, true);
EXPECT(remainder == 0, true);
quotient = uint64_divmod((45ULL << 32) + 13, 15, &remainder);
EXPECT(quotient == 3ULL << 32, true);
EXPECT(remainder == 13, true);
/* Try calling the intrinsics. Don't divide by powers of two, gcc will
* lower that to a shift.
*/
quotient = (45ULL << 32);
quotient /= 15;
EXPECT(quotient == (3ULL << 32), true);
quotient = (45ULL << 32) + 13;
remainder = quotient % 15;
EXPECT(remainder == 13, true);
#endif /* !X64 */
}
void
unit_test_os(void)
{
test_uint64_divmod();
}
#endif /* STANDALONE_UNIT_TEST */