DRi#241: use new kernel xfer events (#2066)

Switches to DR's new kernel xfer events, eliminating the need to wrap the
Ki routines and to watch several system calls.  Removes the corresponding
client_handle_* callbacks from the alloc.h interface in favor of kernel xfer
events handled locally by the user of that interface.
Removes app signal handler tracking, which is no longer needed.

Updates DR to fa465385.  Relaxes an assert triggered by DR's ea84da0
regarding the handling of displaced vsyscall code.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4a02bba..89d0542 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -807,7 +807,7 @@
 endif ()
 
 # when updating this, also update the git submodule
-set(DynamoRIO_VERSION_REQUIRED "6.1.17042")
+set(DynamoRIO_VERSION_REQUIRED "6.2.17499")
 
 set(DR_install_dir "dynamorio")
 
diff --git a/common/alloc.c b/common/alloc.c
index c6af356..1611a36 100644
--- a/common/alloc.c
+++ b/common/alloc.c
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2010-2016 Google, Inc.  All rights reserved.
+ * Copyright (c) 2010-2017 Google, Inc.  All rights reserved.
  * Copyright (c) 2008-2010 VMware, Inc.  All rights reserved.
  * **********************************************************/
 
@@ -96,7 +96,6 @@
 static int sysnum_munmap = -1;
 static int sysnum_valloc = -1;
 static int sysnum_vfree = -1;
-static int sysnum_cbret = -1;
 int sysnum_continue = -1;
 int sysnum_setcontext = -1;
 int sysnum_RaiseException = -1;
@@ -148,20 +147,12 @@
 
 #ifdef WINDOWS
 static void
-alloc_wrap_exception(void *wrapcxt, void OUT **user_data);
+alloc_handle_exception(void *drcontext);
 
 static void
-alloc_wrap_Ki(void *wrapcxt, void OUT **user_data);
+alloc_handle_continue(void *drcontext);
 #endif
 
-static dr_emit_flags_t
-alloc_event_bb_analysis(void *drcontext, void *tag, instrlist_t *bb,
-                        bool for_trace, bool translating, OUT void **user_data);
-
-static dr_emit_flags_t
-alloc_event_bb_insert(void *drcontext, void *tag, instrlist_t *bb, instr_t *inst,
-                      bool for_trace, bool translating, void *user_data);
-
 static bool
 malloc_lock_held_by_self(void);
 
@@ -3050,6 +3041,17 @@
     info->client_data = e->data;
 }
 
+void
+alloc_kernel_xfer(void *drcontext, const dr_kernel_xfer_info_t *info)
+{
+#ifdef WINDOWS
+    if (info->type == DR_XFER_EXCEPTION_DISPATCHER)
+        alloc_handle_exception(drcontext);
+    else if (info->type == DR_XFER_CONTINUE)
+        alloc_handle_continue(drcontext);
+#endif
+}
+
 /* If track_allocs is false, only callbacks and callback returns are tracked.
  * Else: if track_heap is false, only syscall allocs are tracked;
  *       else, syscall allocs and mallocs are tracked.
@@ -3057,15 +3059,6 @@
 void
 alloc_init(alloc_options_t *ops, size_t ops_size)
 {
-    drmgr_priority_t pri_insert = {sizeof(pri_insert), "drmemory.alloc.insert",
-                                   /* must go before CLS exit and after CLS entry */
-                                   DRMGR_PRIORITY_NAME_CLS_EXIT,
-                                   DRMGR_PRIORITY_NAME_CLS_ENTRY,
-                                   DRMGR_PRIORITY_INSERT_ALLOC};
-    if (!drmgr_register_bb_instrumentation_event(alloc_event_bb_analysis,
-                                                 alloc_event_bb_insert, &pri_insert))
-        ASSERT(false, "drmgr registration failed");
-
     ASSERT(ops_size <= sizeof(alloc_ops), "option struct too large");
     memcpy(&alloc_ops, ops, ops_size);
     ASSERT(alloc_ops.track_allocs || !alloc_ops.track_heap,
@@ -3118,8 +3111,15 @@
     }
 #endif
 
-    if (!alloc_ops.track_allocs)
+    if (!alloc_ops.track_allocs) {
         return;
+    }
+
+    /* We want alloc_handle_continue to be late so users can call is_in_seh(). */
+    drmgr_priority_t pri_xfer = {sizeof(pri_xfer), "drmemory.alloc.xfer",
+                                 NULL, NULL, 500};
+    if (!drmgr_register_kernel_xfer_event_ex(alloc_kernel_xfer, &pri_xfer))
+        ASSERT(false, "xfer event registration failed");
 
     /* set up the per-malloc API */
     if (alloc_ops.replace_malloc)
@@ -3137,6 +3137,8 @@
         dr_mutex_destroy(alloc_routine_lock);
     }
 
+    drmgr_unregister_kernel_xfer_event(alloc_kernel_xfer);
+
     if (!alloc_ops.track_allocs)
         return;
 
@@ -3197,38 +3199,8 @@
         return;
 
     if (stri_eq(modname, "ntdll.dll")) {
-        app_pc addr_KiCallback = (app_pc)
-            dr_get_proc_address(info->handle, "KiUserCallbackDispatcher");
-        ASSERT(addr_KiCallback != NULL, "can't find Ki routine");
-        if (!drwrap_wrap_ex(addr_KiCallback, alloc_wrap_Ki, NULL, (void*)1, 0))
-            ASSERT(false, "failed to wrap");
         if (alloc_ops.track_allocs) {
-            app_pc addr_KiAPC, addr_KiLdrThunk, addr_KiException, addr_KiRaise;
-            addr_KiAPC = (app_pc) dr_get_proc_address(info->handle,
-                                                      "KiUserApcDispatcher");
-            addr_KiLdrThunk = (app_pc) dr_get_proc_address(info->handle,
-                                                           "LdrInitializeThunk");
-            addr_KiException = (app_pc) dr_get_proc_address(info->handle,
-                                                            "KiUserExceptionDispatcher");
-            addr_KiRaise = (app_pc) dr_get_proc_address(info->handle,
-                                                        "KiRaiseUserExceptionDispatcher");
-            /* Assuming that KiUserCallbackExceptionHandler,
-             * KiUserApcExceptionHandler, and the Ki*SystemCall* routines are not
-             * entered from the kernel.
-             */
-            ASSERT(addr_KiAPC != NULL && addr_KiLdrThunk != NULL &&
-                   addr_KiException != NULL && addr_KiRaise != NULL,
-                   "can't find Ki routine");
-            if (!drwrap_wrap_ex(addr_KiAPC, alloc_wrap_Ki, NULL, (void*)0, 0) ||
-                !drwrap_wrap(addr_KiException, alloc_wrap_exception, NULL) ||
-                !drwrap_wrap_ex(addr_KiRaise, alloc_wrap_Ki, NULL, (void*)0, 0))
-                ASSERT(false, "failed to wrap");
-            /* we should ignore LdrInitializeThunk on pre-Vista since we'll get
-             * there via APC: only on Vista+ is it a "Ki" routine
-             */
-            if (running_on_Vista_or_later() &&
-                !drwrap_wrap_ex(addr_KiLdrThunk, alloc_wrap_Ki, NULL, (void*)0, 0))
-                ASSERT(false, "failed to wrap");
+            /* We no longer need to wrap Ki routines: we use event_kernel_xfer now. */
 
             /* FIXME i#1153: watch NtWow64AllocateVirtualMemory64 on win8 */
             get_primary_sysnum("NtMapViewOfSection", &sysnum_mmap, false);
@@ -3236,7 +3208,6 @@
             get_primary_sysnum("NtAllocateVirtualMemory", &sysnum_valloc, false);
             get_primary_sysnum("NtFreeVirtualMemory", &sysnum_vfree, false);
             get_primary_sysnum("NtContinue", &sysnum_continue, false);
-            get_primary_sysnum("NtCallbackReturn", &sysnum_cbret, false);
             get_primary_sysnum("NtSetContextThread", &sysnum_setcontext, false);
             get_primary_sysnum("NtMapCMFModule", &sysnum_mapcmf,
                                !running_on_Win7_or_later());
@@ -4400,15 +4371,6 @@
 /*
  ***************************************************************************/
 
-#ifdef WINDOWS
-static void
-handle_cbret(bool syscall)
-{
-    void *drcontext = dr_get_current_drcontext();
-    client_handle_cbret(drcontext);
-}
-#endif
-
 bool
 alloc_syscall_filter(void *drcontext, int sysnum)
 {
@@ -4418,7 +4380,7 @@
 #ifdef WINDOWS
     if (sysnum == sysnum_mmap || sysnum == sysnum_munmap ||
         sysnum == sysnum_valloc || sysnum == sysnum_vfree ||
-        sysnum == sysnum_cbret || sysnum == sysnum_continue ||
+        sysnum == sysnum_continue ||
         sysnum == sysnum_RaiseException ||
         sysnum == sysnum_setcontext || sysnum == sysnum_mapcmf ||
         sysnum == sysnum_UserConnectToServer ||
@@ -4456,7 +4418,7 @@
 #ifdef WINDOWS
     if (sysnum == sysnum_mmap || sysnum == sysnum_munmap ||
         sysnum == sysnum_valloc || sysnum == sysnum_vfree ||
-        sysnum == sysnum_cbret || sysnum == sysnum_continue ||
+        sysnum == sysnum_continue ||
         sysnum == sysnum_setcontext || sysnum == sysnum_mapcmf ||
         sysnum == sysnum_SetInformationProcess) {
         HANDLE process;
@@ -4570,13 +4532,6 @@
                                        allocation_size(pt->munmap_base, NULL), mc);
                 }
             }
-        } else if (sysnum == sysnum_cbret) {
-            handle_cbret(true/*syscall*/);
-        } else if (sysnum == sysnum_continue) {
-            client_handle_continue(drcontext, mc);
-            if (pt->in_seh)
-                pt->in_seh = false;
-            /* else, an APC */
         }
     }
 #else /* WINDOWS */
@@ -6721,11 +6676,9 @@
 
 #ifdef WINDOWS
 static void
-alloc_wrap_exception(void *wrapcxt, void OUT **user_data)
+alloc_handle_exception(void *drcontext)
 {
-    void *drcontext = dr_get_current_drcontext();
     cls_alloc_t *pt = (cls_alloc_t *) drmgr_get_cls_field(drcontext, cls_idx_alloc);
-    dr_mcontext_t *mc = drwrap_get_mcontext_ex(wrapcxt, DR_MC_GPR); /* don't need xmm */
     /* XXX PR 408545: preserve pre-fault values and watch NtContinue and
      * longjmp (unless longjmp from top handler still invokes
      * NtContinue) and determine whether returning to heap routine.  For
@@ -6737,25 +6690,15 @@
      */
     LOG(2, "Exception in app\n");
     pt->in_seh = true;
-    client_handle_exception(drcontext, mc);
-    client_handle_Ki(drcontext, drwrap_get_func(wrapcxt), mc, false/*!cb*/);
 }
 
 static void
-alloc_wrap_Ki(void *wrapcxt, void OUT **user_data)
+alloc_handle_continue(void *drcontext)
 {
-    app_pc pc = drwrap_get_func(wrapcxt);
-    void *drcontext = dr_get_current_drcontext();
-    dr_mcontext_t *mc = drwrap_get_mcontext_ex(wrapcxt, DR_MC_GPR); /* don't need xmm */
-    bool is_cb = (bool) *user_data;
-    ASSERT(pc != NULL, "alloc_hook: pc is NULL!");
-    /* our per-thread data is private per callback so we're already handling
-     * cbs (though we don't expect callbacks to interrupt heap routines).
-     * we handle exceptions interrupting heap routines here.
-     */
-    client_handle_Ki(drcontext, pc, mc, is_cb);
-    if (is_cb)
-        client_handle_callback(drcontext);
+    cls_alloc_t *pt = (cls_alloc_t *) drmgr_get_cls_field(drcontext, cls_idx_alloc);
+    if (pt->in_seh)
+        pt->in_seh = false;
+    /* else, an APC */
 }
 #endif /* WINDOWS */
 
@@ -7071,31 +7014,6 @@
     return drwrap_is_post_wrap(pc);
 }
 
-static dr_emit_flags_t
-alloc_event_bb_analysis(void *drcontext, void *tag, instrlist_t *bb,
-                        bool for_trace, bool translating, OUT void **user_data)
-{
-    /* Nothing anymore (used to have i#690 mod_pending_tree check here) */
-    return DR_EMIT_DEFAULT;
-}
-
-static dr_emit_flags_t
-alloc_event_bb_insert(void *drcontext, void *tag, instrlist_t *bb, instr_t *inst,
-                      bool for_trace, bool translating, void *user_data)
-{
-    app_pc pc = instr_get_app_pc(inst);
-    if (pc == NULL)
-        return DR_EMIT_DEFAULT;
-#ifdef WINDOWS
-    if (instr_get_opcode(inst) == OP_int &&
-        opnd_get_immed_int(instr_get_src(inst, 0)) == CBRET_INTERRUPT_NUM) {
-        dr_insert_clean_call(drcontext, bb, inst, (void *)handle_cbret, false,
-                             1, OPND_CREATE_INT32(0/*not syscall*/));
-    }
-#endif
-    return DR_EMIT_DEFAULT;
-}
-
 /***************************************************************************
  * Large malloc tree
  */
diff --git a/common/alloc.h b/common/alloc.h
index 7580dd2..42aedd3 100644
--- a/common/alloc.h
+++ b/common/alloc.h
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2010-2016 Google, Inc.  All rights reserved.
+ * Copyright (c) 2010-2017 Google, Inc.  All rights reserved.
  * Copyright (c) 2008-2010 VMware, Inc.  All rights reserved.
  * **********************************************************/
 
@@ -440,24 +440,6 @@
 void
 client_remove_malloc_on_destroy(HANDLE heap, byte *start, byte *end);
 
-/* called BEFORE drmgr has popped the child context */
-void
-client_handle_cbret(void *drcontext);
-
-/* called AFTER drmgr has pushed a new context */
-void
-client_handle_callback(void *drcontext);
-
-/* for is_cb, called AFTER drmgr has pushed a new context */
-void
-client_handle_Ki(void *drcontext, app_pc pc, dr_mcontext_t *mc, bool is_cb);
-
-void
-client_handle_exception(void *drcontext, dr_mcontext_t *mc);
-
-void
-client_handle_continue(void *drcontext, dr_mcontext_t *mc);
-
 bool
 is_in_seh(void *drcontext);
 
diff --git a/drheapstat/drheapstat.c b/drheapstat/drheapstat.c
index 53b0e33..21ae5d3 100644
--- a/drheapstat/drheapstat.c
+++ b/drheapstat/drheapstat.c
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2010-2015 Google, Inc.  All rights reserved.
+ * Copyright (c) 2010-2017 Google, Inc.  All rights reserved.
  * Copyright (c) 2009-2010 VMware, Inc.  All rights reserved.
  * **********************************************************/
 
@@ -1046,31 +1046,6 @@
     if (options.check_leaks)
         leak_remove_malloc_on_destroy(heap, start, end);
 }
-
-void
-client_handle_cbret(void *drcontext)
-{
-}
-
-void
-client_handle_callback(void *drcontext)
-{
-}
-
-void
-client_handle_Ki(void *drcontext, app_pc pc, dr_mcontext_t *mc, bool is_cb)
-{
-}
-
-void
-client_handle_exception(void *drcontext, dr_mcontext_t *mc)
-{
-}
-
-void
-client_handle_continue(void *drcontext, dr_mcontext_t *mc)
-{
-}
 #endif /* WINDOWS */
 
 void
diff --git a/drmemory/alloc_drmem.c b/drmemory/alloc_drmem.c
index 0277354..02c60a5 100644
--- a/drmemory/alloc_drmem.c
+++ b/drmemory/alloc_drmem.c
@@ -58,10 +58,6 @@
 static hashtable_t alloc_stack_table;
 
 #ifdef UNIX
-/* Track all signal handlers registered by app so we can instrument them */
-#define SIGHAND_HASH_BITS 6
-hashtable_t sighand_table;
-
 /* PR 418629: to determine stack bounds accurately we track anon mmaps */
 static rb_tree_t *mmap_tree;
 static void *mmap_tree_lock; /* maybe rbtree should support internal synch */
@@ -220,7 +216,6 @@
                       (bool (*)(void*, void*)) packed_callstack_cmp);
 
 #ifdef UNIX
-    hashtable_init(&sighand_table, SIGHAND_HASH_BITS, HASH_INTPTR, false/*!strdup*/);
     mmap_tree = rb_tree_create(NULL);
     mmap_tree_lock = dr_mutex_create();
 #endif
@@ -257,7 +252,6 @@
     alloc_exit(); /* must be before deleting alloc_stack_table */
     hashtable_delete_with_stats(&alloc_stack_table, "alloc stack table");
 #ifdef UNIX
-    hashtable_delete(&sighand_table);
     rb_tree_destroy(mmap_tree);
     dr_mutex_destroy(mmap_tree_lock);
 #endif
@@ -1207,51 +1201,14 @@
 #endif
 
 #ifdef WINDOWS
-void
-client_handle_cbret(void *drcontext)
-{
-    umbra_shadow_memory_info_t info;
-    dr_mcontext_t mc; /* do not init whole thing: memset is expensive */
-    byte *sp;
-    cls_drmem_t *cpt_parent = (cls_drmem_t *)
-        drmgr_get_parent_cls_field(drcontext, cls_idx_drmem);
-    if (cpt_parent == NULL) /* DR took over in middle of callback */
-        return;
-    if (!options.shadowing)
-        return;
-    syscall_handle_cbret(drcontext);
-
-    if (!options.check_stack_bounds)
-        return;
-
-    mc.size = sizeof(mc);
-    mc.flags = DR_MC_CONTROL; /* only need xsp */
-    dr_get_mcontext(drcontext, &mc);
-    sp = (byte *) mc.xsp;
-    LOG(2, "cbret: marking stack "PFX"-"PFX" as unaddressable\n",
-        sp, cpt_parent->pre_callback_esp);
-    LOG(3, "cbret: cpt_parent is "PFX", cpt is "PFX"\n",
-        cpt_parent, drmgr_get_cls_field(drcontext, cls_idx_drmem));
-    umbra_shadow_memory_info_init(&info);
-    for (; sp < cpt_parent->pre_callback_esp; sp++)
-        shadow_set_byte(&info, sp, SHADOW_UNADDRESSABLE);
-}
-
-void
-client_handle_callback(void *drcontext)
-{
-    LOG(2, "Entering windows callback handler\n");
-    syscall_handle_callback(drcontext);
-}
-
-void
-client_handle_Ki(void *drcontext, app_pc pc, dr_mcontext_t *mc, bool is_cb)
+static void
+handle_Ki(void *drcontext, app_pc pc, byte *new_xsp, bool is_cb)
 {
     /* The kernel has placed some data on the stack.  We assume we're
      * on the same thread stack.  FIXME: check those assumptions by checking
      * default stack bounds.
      */
-    app_pc sp = (app_pc) mc->xsp;
+    app_pc sp = new_xsp;
     TEB *teb = get_TEB();
     app_pc base_esp = teb->StackBase;
     app_pc stop_esp = NULL;
@@ -1273,7 +1230,7 @@
             sp += 4; /* 4 bytes map to one so skip to next */
         else
             sp++;
-        if (sp - (byte *) mc->xsp >= TYPICAL_STACK_MIN_SIZE) {
+        if (sp - new_xsp >= TYPICAL_STACK_MIN_SIZE) {
             ASSERT(false, "kernel-placed data on stack too large: error?");
             break; /* abort */
         }
@@ -1281,7 +1238,7 @@
     ASSERT(ALIGNED(sp, 4), "stack not aligned");
 
     LOG(2, "Ki routine "PFX": marked stack "PFX"-"PFX" as defined\n",
-        pc, mc->xsp, sp);
+        pc, new_xsp, sp);
 
     if (is_cb) {
         /* drmgr already pushed a new context */
@@ -1296,21 +1253,55 @@
     }
 }
 
-void
-client_handle_exception(void *drcontext, dr_mcontext_t *mc)
+static void
+handle_callback(void *drcontext)
+{
+    LOG(2, "Entering windows callback handler\n");
+    syscall_handle_callback(drcontext);
+}
+
+static void
+handle_cbret(void *drcontext, const dr_kernel_xfer_info_t *xfer_info)
+{
+    umbra_shadow_memory_info_t info;
+    byte *sp = (byte *) xfer_info->source_mcontext->xsp;
+    cls_drmem_t *cpt_parent = (cls_drmem_t *)
+        drmgr_get_parent_cls_field(drcontext, cls_idx_drmem);
+    if (cpt_parent == NULL) /* DR took over in middle of callback */
+        return;
+    if (!options.shadowing)
+        return;
+    syscall_handle_cbret(drcontext);
+
+    if (!options.check_stack_bounds)
+        return;
+
+    ASSERT(cpt_parent->pre_callback_esp == (byte *)xfer_info->target_xsp,
+           "cb xsp mismatch");
+    LOG(2, "cbret: marking stack "PFX"-"PFX" as unaddressable\n",
+        sp, cpt_parent->pre_callback_esp);
+    LOG(3, "cbret: cpt_parent is "PFX", cpt is "PFX"\n",
+        cpt_parent, drmgr_get_cls_field(drcontext, cls_idx_drmem));
+    umbra_shadow_memory_info_init(&info);
+    for (; sp < cpt_parent->pre_callback_esp; sp++)
+        shadow_set_byte(&info, sp, SHADOW_UNADDRESSABLE);
+}
+
+static void
+handle_exception(void *drcontext)
 {
     cls_drmem_t *cpt = (cls_drmem_t *) drmgr_get_cls_field(drcontext, cls_idx_drmem);
     cpt->heap_critsec = NULL;
 }
 
-void
-client_handle_continue(void *drcontext, dr_mcontext_t *mc)
+static void
+handle_continue(void *drcontext)
 {
+    /* We rely on this running *before* alloc.c's so is_in_seh() is correct. */
     cls_drmem_t *cpt = (cls_drmem_t *) drmgr_get_cls_field(drcontext, cls_idx_drmem);
     if (is_in_seh(drcontext)) {
         cpt->heap_critsec = NULL;
-    }
-    /* else it was an APC */
+    } /* else it was an APC */
 }
 #endif /* WINDOWS */
 
@@ -1418,6 +1409,7 @@
     if (!options.check_stack_bounds)
         return;
     if (sysnum == sysnum_continue) {
+        /* XXX: we could move this to the kernel xfer event */
         CONTEXT *cxt = (CONTEXT *) dr_syscall_get_param(drcontext, 0);
         umbra_shadow_memory_info_t info;
         umbra_shadow_memory_info_init(&info);
@@ -1483,110 +1475,7 @@
     }
 #else
     cls_drmem_t *cpt = (cls_drmem_t *) drmgr_get_cls_field(drcontext, cls_idx_drmem);
-    if (sysnum == IF_MACOS_ELSE(SYS_sigaction, SYS_rt_sigaction)
-        IF_NOT_X64(|| sysnum == SYS_sigaction IF_LINUX(|| sysnum == SYS_signal))) {
-        /* PR 406333: linux signal delivery.
-         * For delivery: signal event doesn't help us since have to predict
-         * which stack and size of frame: should intercept handler registration
-         * and wait until enter a handler.  Can ignore SIG_IGN and SIG_DFL.
-         */
-        void *handler = NULL;
-# ifdef MACOS
-        if (sysnum == SYS_sigaction) {
-            /* 2nd arg is ptr to struct w/ app handler as 1st field, but libc
-             * trampoline as 2nd field.
-             */
-            safe_read((byte *)syscall_get_param(drcontext, 1) + sizeof(handler),
-                      sizeof(handler), &handler);
-        }
-# else
-        if (sysnum == SYS_rt_sigaction) {
-            /* 2nd arg is ptr to struct w/ handler as 1st field */
-            safe_read((void *)syscall_get_param(drcontext, 1), sizeof(handler), &handler);
-        }
-#  ifdef X86_32
-        else if (sysnum == SYS_sigaction) {
-            /* 2nd arg is ptr to struct w/ handler as 1st field */
-            safe_read((void *)syscall_get_param(drcontext, 1), sizeof(handler), &handler);
-        }
-        else if (sysnum == SYS_signal) {
-            /* 2nd arg is handler */
-            handler = (void *) syscall_get_param(drcontext, 1);
-        }
-#  endif
-# endif
-        if (handler != NULL) {
-            LOG(2, "SYS_rt_sigaction/etc.: new handler "PFX"\n", handler);
-            /* We make a simplifying assumption: handler code is only used for
-             * signal handling.  We could keep a counter and inc on every success
-             * and dec on failure and on change to IGN/DFL and remove when it hits
-             * 0 -- but might have races where a final signal comes in.  We assume
-             * we can leave our instrumentation there and if it is executed
-             * executed for non-signals our check for prior signal event
-             * is good enough to distinguish.
-             */
-            if (handler != SIG_IGN && handler != SIG_DFL)
-                hashtable_add(&sighand_table, (void*)handler, (void*)1);
-        } else {
-            LOG(2, "SYS_rt_sigaction/etc.: bad handler\n");
-        }
-    }
-    else if ((sysnum == IF_MACOS_ELSE(SYS_sigreturn, SYS_rt_sigreturn)
-              IF_NOT_X64(|| sysnum == SYS_sigreturn)) &&
-             options.check_stack_bounds) {
-#ifdef LINUX
-        /* PR 406333: linux signal delivery.
-         * On sigreturn, whether altstack or not, invalidate
-         * where frame was.  Either need to record at handler entry the base of
-         * the frame, or at sigreturn determine target esp: the former is
-         * complicated by nested signals and signals that use longjmp, so
-         * we do the latter.
-         *
-         * Longjmp exiting a signal requires no special handling when it
-         * goes up the stack (or down) since we see the xsp assignment.
-         */
-        byte *sp = (byte *)mc.xsp;
-        byte *new_sp;
-        struct sigcontext *sc;
-        if (sysnum == IF_MACOS_ELSE(SYS_sigreturn, SYS_rt_sigreturn)) {
-            /* first, skip signum and siginfo ptr to get ucontext ptr */
-            struct ucontext *ucxt;
-            if (safe_read(sp + sizeof(int) + sizeof(struct siginfo*),
-                          sizeof(ucxt), &ucxt)) {
-                sc = (struct sigcontext *) &ucxt->uc_mcontext;
-            } else {
-                LOG(1, "WARNING: can't read sc pointer at sigreturn\n");
-                sc = NULL;
-            }
-        } else {
-            sc = (struct sigcontext *) sp;
-        }
-        if (sc != NULL &&
-            safe_read(&sc->IF_X64_ELSE(rsp, IF_ARM_ELSE(arm_sp, esp)),
-                      sizeof(new_sp), &new_sp)) {
-            byte *unaddr_top = NULL;
-            if (new_sp > sp && (size_t)(new_sp - sp) < MAX_SIGNAL_FRAME_SIZE) {
-                unaddr_top = new_sp;
-            } else if (cpt->sigaltstack != NULL && cpt->sigaltstack > sp &&
-                       (size_t)(cpt->sigaltstack - sp) < cpt->sigaltsize) {
-                /* transitioning from sigaltstack to regular stack */
-                unaddr_top = cpt->sigaltstack;
-            } else {
-                LOG(2, "at sigreturn but new sp "PFX" irregular vs "PFX"\n", new_sp, sp);
-            }
-            if (unaddr_top != NULL) {
-                LOG(2, "at sigreturn: marking frame "PFX"-"PFX" unaddressable\n",
-                    sp, unaddr_top);
-                shadow_set_range(sp, unaddr_top, SHADOW_UNADDRESSABLE);
-            }
-        } else {
-            LOG(1, "WARNING: can't read sc->xsp at sigreturn\n");
-        }
-#else
-        /* FIXME i#1438: add Mac handling */
-#endif
-    }
-    else if (sysnum == SYS_sigaltstack) {
+    if (sysnum == SYS_sigaltstack) {
         /* PR 406333: linux signal delivery */
         stack_t stk;
         cpt->prev_sigaltstack = cpt->sigaltstack;
@@ -1631,19 +1520,7 @@
     cls_drmem_t *cpt = (cls_drmem_t *) drmgr_get_cls_field(drcontext, cls_idx_drmem);
     if (!options.shadowing)
         return;
-    if (sysnum == IF_MACOS_ELSE(SYS_sigreturn, SYS_rt_sigaction)
-        IF_NOT_X64(|| sysnum == SYS_sigaction IF_LINUX(|| sysnum == SYS_signal))) {
-        if (result != 0) {
-            LOG(2, "SYS_rt_sigaction/etc. FAILED for handler "PFX"\n",
-                  syscall_get_param(drcontext, 1));
-            /* See notes above: if we had a counter we could remove from
-             * sighand_table if there were no successfull registrations --
-             * but we assume handler code is only used for signals so
-             * we just leave in the table and rely on our pre-event check.
-             */
-        }
-    }
-    else if (sysnum == SYS_sigaltstack) {
+    if (sysnum == SYS_sigaltstack) {
         if (result != 0) {
             /* We can't query the OS since DR is hiding the real sigaltstack,
              * so we record the prev value
@@ -1670,7 +1547,7 @@
 }
 
 static void
-at_signal_handler(void)
+handle_signal_delivery(void *drcontext, reg_t dst_xsp)
 {
     /* PR 406333: linux signal delivery.
      * Need to know extent of frame: could record xsp in signal event,
@@ -1689,7 +1566,6 @@
      * at the very base of a stack and we could walk off onto
      * adjacent memory: we ignore that.
      */
-    void *drcontext = dr_get_current_drcontext();
     cls_drmem_t *cpt = (cls_drmem_t *) drmgr_get_cls_field(drcontext, cls_idx_drmem);
     dr_mcontext_t mc; /* do not init whole thing: memset is expensive */
     byte *sp, *stop;
@@ -1722,16 +1598,61 @@
     LOG(2, "signal handler: marked new frame defined "PFX"-"PFX"\n", mc.xsp, sp);
 }
 
-void
-instrument_signal_handler(void *drcontext, instrlist_t *bb, instr_t *inst,
-                          app_pc pc)
+static void
+handle_signal_return(void *drcontext, const dr_mcontext_t *src_mc, byte *new_sp)
 {
-    LOG(3, "instrumenting signal handler "PFX"\n", pc);
-    dr_insert_clean_call(drcontext, bb, inst, (void *)at_signal_handler,
-                         false, 0);
+    ASSERT(options.shadowing && options.check_stack_bounds, "incorrectly called");
+    ASSERT(src_mc != NULL && TEST(DR_MC_CONTROL, src_mc->flags),
+           "src_mc should always exist for sigreturn");
+    cls_drmem_t *cpt = (cls_drmem_t *) drmgr_get_cls_field(drcontext, cls_idx_drmem);
+    byte *sp = (byte *)src_mc->xsp;
+    byte *unaddr_top = NULL;
+    if (new_sp > sp && (size_t)(new_sp - sp) < MAX_SIGNAL_FRAME_SIZE) {
+        unaddr_top = new_sp;
+    } else if (cpt->sigaltstack != NULL && cpt->sigaltstack > sp &&
+               (size_t)(cpt->sigaltstack - sp) < cpt->sigaltsize) {
+        /* transitioning from sigaltstack to regular stack */
+        unaddr_top = cpt->sigaltstack;
+    } else {
+        LOG(2, "at sigreturn but new sp "PFX" irregular vs "PFX"\n", new_sp, sp);
+    }
+    if (unaddr_top != NULL) {
+        LOG(2, "at sigreturn: marking frame "PFX"-"PFX" unaddressable\n",
+            sp, unaddr_top);
+        shadow_set_range(sp, unaddr_top, SHADOW_UNADDRESSABLE);
+    }
 }
 #endif /* UNIX */
 
+void
+event_kernel_xfer(void *drcontext, const dr_kernel_xfer_info_t *info)
+{
+#ifdef UNIX
+    if (!options.shadowing || !options.check_stack_bounds)
+        return;
+    if (info->type == DR_XFER_SIGNAL_DELIVERY)
+        handle_signal_delivery(drcontext, info->target_xsp);
+    else if (info->type == DR_XFER_SIGNAL_RETURN)
+        handle_signal_return(drcontext, info->source_mcontext, (byte *)info->target_xsp);
+#else
+    if (info->type == DR_XFER_CALLBACK_DISPATCHER) {
+        handle_Ki(drcontext, info->target_pc, (byte*)info->target_xsp, true);
+        handle_callback(drcontext);
+    }
+    else if (info->type == DR_XFER_CALLBACK_RETURN)
+        handle_cbret(drcontext, info);
+    else if (info->type == DR_XFER_APC_DISPATCHER ||
+             info->type == DR_XFER_RAISE_DISPATCHER)
+        handle_Ki(drcontext, info->target_pc, (byte*)info->target_xsp, false);
+    else if (info->type == DR_XFER_EXCEPTION_DISPATCHER) {
+        handle_exception(drcontext);
+        handle_Ki(drcontext, info->target_pc, (byte*)info->target_xsp, false);
+    }
+    else if (info->type == DR_XFER_CONTINUE)
+        handle_continue(drcontext);
+#endif
+}
+
 /***************************************************************************
  * ADDRESSABILITY
  */
diff --git a/drmemory/alloc_drmem.h b/drmemory/alloc_drmem.h
index 0906bf2..de9c1e0 100644
--- a/drmemory/alloc_drmem.h
+++ b/drmemory/alloc_drmem.h
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2013-2015 Google, Inc.  All rights reserved.
+ * Copyright (c) 2013-2017 Google, Inc.  All rights reserved.
  * Copyright (c) 2008-2009 VMware, Inc.  All rights reserved.
  * **********************************************************/
 
@@ -29,10 +29,6 @@
 
 #include "callstack.h" /* app_loc_t */
 
-#ifdef UNIX
-extern hashtable_t sighand_table;
-#endif
-
 void
 alloc_drmem_init(void);
 
@@ -43,14 +39,13 @@
 check_unaddressable_exceptions(bool write, app_loc_t *loc, app_pc addr, uint sz,
                                bool addr_on_stack, dr_mcontext_t *mc);
 
+void
+event_kernel_xfer(void *drcontext, const dr_kernel_xfer_info_t *info);
+
 #ifdef UNIX
 dr_signal_action_t
 event_signal_alloc(void *drcontext, dr_siginfo_t *info);
 
-void
-instrument_signal_handler(void *drcontext, instrlist_t *bb, instr_t *inst,
-                          app_pc pc);
-
 bool
 mmap_anon_lookup(byte *addr, byte **start OUT, size_t *size OUT);
 #endif
diff --git a/drmemory/drmemory.c b/drmemory/drmemory.c
index 4429e2b..34f8bde 100644
--- a/drmemory/drmemory.c
+++ b/drmemory/drmemory.c
@@ -1213,8 +1213,8 @@
      * can set base part of stack for primary thread.
      * For drinject, stack is clean, except on Vista where a few words
      * are above esp.
-     * Note that this is the start esp: the APC esp is handled in
-     * client_handle_Ki.
+     * Note that this is the start esp, due to DRi#2718: the APC esp is handled in
+     * handle_Ki().
      */
     IF_DEBUG(ok = )
         dr_get_mcontext(drcontext, &mc);
@@ -1935,6 +1935,7 @@
     dr_register_nudge_event(event_nudge, client_id);
     if (options.soft_kills)
         drx_register_soft_kills(event_soft_kill);
+    drmgr_register_kernel_xfer_event(event_kernel_xfer);
 #ifdef UNIX
     dr_register_fork_init_event(event_fork);
     drmgr_register_signal_event(event_signal);
diff --git a/drmemory/instru.c b/drmemory/instru.c
index 077fa08..aeb11ed 100644
--- a/drmemory/instru.c
+++ b/drmemory/instru.c
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2010-2016 Google, Inc.  All rights reserved.
+ * Copyright (c) 2010-2017 Google, Inc.  All rights reserved.
  * Copyright (c) 2008-2010 VMware, Inc.  All rights reserved.
  * **********************************************************/
 
@@ -1165,13 +1165,6 @@
         }
     }
 
-#if defined(UNIX) && defined(TOOL_DR_MEMORY)
-    if (options.shadowing &&
-        hashtable_lookup(&sighand_table, (void*)pc) != NULL) {
-        instrument_signal_handler(drcontext, bb, inst, pc);
-    }
-#endif
-
     if (INSTRUMENT_MEMREFS()) {
         /* We want to spill AFTER any clean call in case it changes mcontext */
         /* XXX: examine this: how make it more in spirit of drmgr? */
diff --git a/drmemory/spill.c b/drmemory/spill.c
index b1f0dca..a0ed87c 100644
--- a/drmemory/spill.c
+++ b/drmemory/spill.c
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2010-2015 Google, Inc.  All rights reserved.
+ * Copyright (c) 2010-2017 Google, Inc.  All rights reserved.
  * Copyright (c) 2008-2010 VMware, Inc.  All rights reserved.
  * **********************************************************/
 
@@ -1347,13 +1347,16 @@
 {
     instr_t *inst = instrlist_first(bb);
 #ifdef DEBUG
-    app_pc prev_pc = dr_fragment_app_pc(tag);
-    ASSERT(prev_pc != NULL, "bb tag must not be NULL");
+    /* We look at instr pc, not the tag, to handle displaced code such
+     * as for the vsyscall hook.
+     */
+    app_pc prev_pc = instr_get_app_pc(instrlist_first_app(bb));
+    ASSERT(prev_pc != NULL, "bb first app pc must not be NULL");
     /* i#260 and i#1466: bbs must be contiguous */
     if (inst != NULL && whole_bb_spills_enabled() &&
         /* bi->is_repstr_to_loop is set in app2app and may mess up the instr pc */
         !bi->is_repstr_to_loop) {
-        for (; inst != NULL; inst = instr_get_next(inst)) {
+        for (; inst != NULL; inst = instr_get_next_app(inst)) {
             app_pc cur_pc = instr_get_app_pc(inst);
             if (cur_pc == NULL)
                 continue;
diff --git a/dynamorio b/dynamorio
index 38338ae..fa46538 160000
--- a/dynamorio
+++ b/dynamorio
@@ -1 +1 @@
-Subproject commit 38338ae437dbbb5998bf9b8f3eebc394f063b076
+Subproject commit fa4653857b09c4983219772159c0f208394471c7