Add logic for patching calls to the x86-64 vsyscall page

This is necessary on Linux 3.1 because the vsyscalls now make real
syscalls to the kernel, rather than just reading memory as they
usually did before, which means these calls fail in seccomp mode.

Although the vsyscall page is deprecated, glibc still contains some
calls to it.  We detect and patch the instruction sequence that
matters.  (We do this because, unfortunately, the kernel does not let
us change the permissions on the vsyscall page to patch it.)

glibc still contains a code path that could call vgettimeofday via a
different instruction sequence, which is much harder to patch, and we
don't try to.  libc.so has code to store vgettimeofday's address
(0xffffffffff600000) in TLS, but in practice this code path is not
used when the vdso is present.

To apply the patch we replace the instructions with a syscall, which
later gets re-patched to be a jump.

BUG=http://code.google.com/p/seccompsandbox/issues/detail?id=17
TEST=test_patching_vsyscall_* on any Linux version,
  plus test_time and test_sched_getcpu on Linux 3.1

Review URL: http://codereview.chromium.org/8605003

git-svn-id: http://seccompsandbox.googlecode.com/svn/trunk@178 55e79e8e-603c-11de-8c10-5fe6993ea61f
diff --git a/library.cc b/library.cc
index 5845943..07a41cc 100644
--- a/library.cc
+++ b/library.cc
@@ -364,6 +364,54 @@
   Sandbox::die("Insufficient space to intercept system call");
 }
 
+#if defined(__x86_64__)
+static bool isCallToVsyscallPage(char* code) {
+  // Look for these instructions, which are a call to the x86-64
+  // vsyscall page, which the kernel puts at a fixed address:
+  //
+  //   48 c7 c0 00 XX 60 ff    mov    $0xffffffffff60XX00,%rax
+  //   ff d0                   callq  *%rax
+  //
+  // This will not catch all calls to the vsyscall page, but it
+  // handles the important cases that glibc contains.  The vsyscall
+  // page is deprecated, so it is unlikely that new instruction
+  // sequences for calling it will be introduced.
+  return (code[0] == '\x48' &&
+          code[1] == '\xc7' &&
+          code[2] == '\xc0' &&
+          code[3] == '\x00' &&
+          (code[4] == '\x00' || code[4] == '\x04' || code[4] == '\x08') &&
+          code[5] == '\x60' &&
+          code[6] == '\xff' &&
+          code[7] == '\xff' &&
+          code[8] == '\xd0');
+}
+
+static void patchCallToVsyscallPage(char* code) {
+  // We replace the mov+callq with these instructions:
+  //
+  //   b8 XX XX XX XX   mov $X, %eax  // where X is the syscall number
+  //   0f 05            syscall
+  //   90               nop
+  //   90               nop
+  //
+  // The syscall instruction will later be patched by the general case.
+  if (code[4] == '\x00') {
+    // Use __NR_gettimeofday == 96 == 0x60.
+    const char replacement[] = "\xb8\x60\x00\x00\x00\x0f\x05\x90\x90";
+    memcpy(code, replacement, sizeof(replacement) - 1);
+  } else if (code[4] == '\x04') {
+    // Use __NR_time == 201 == 0xc9.
+    const char replacement[] = "\xb8\xc9\x00\x00\x00\x0f\x05\x90\x90";
+    memcpy(code, replacement, sizeof(replacement) - 1);
+  } else if (code[4] == '\x08') {
+    // Use __NR_getcpu == 309 == 0x135.
+    const char replacement[] = "\xb8\x35\x01\x00\x00\x0f\x05\x90\x90";
+    memcpy(code, replacement, sizeof(replacement) - 1);
+  }
+}
+#endif
+
 void Library::patchSystemCallsInFunction(const Maps* maps, int vsys_offset,
                                          char* start, char* end,
                                          char** extraSpace, int* extraLength) {
@@ -392,6 +440,11 @@
   int codeIdx = 0;
   char* ptr = start;
   while (ptr < end) {
+    #if defined(__x86_64__)
+    if (isCallToVsyscallPage(ptr)) {
+      patchCallToVsyscallPage(ptr);
+    }
+    #endif
     // Keep a ring-buffer of the last few instruction in order to find the
     // correct place to patch the code.
     char *mod_rm;
@@ -1031,7 +1084,8 @@
   for (char *ptr = start; ptr < stop; ptr++) {
     #if defined(__x86_64__)
     if ((*ptr == '\x0F' && ptr[1] == '\x05' /* SYSCALL */) ||
-        (isVDSO_ && *ptr == '\xFF')) {
+        (isVDSO_ && *ptr == '\xFF') ||
+        isCallToVsyscallPage(ptr)) {
     #elif defined(__i386__)
     if ((*ptr   == '\xCD' && ptr[1] == '\x80' /* INT $0x80 */) ||
         (*ptr   == '\x65' && ptr[1] == '\xFF' &&
diff --git a/tests/test_patching.cc b/tests/test_patching.cc
index bb1babb..448bdac 100644
--- a/tests/test_patching.cc
+++ b/tests/test_patching.cc
@@ -3,6 +3,8 @@
 // found in the LICENSE file.
 
 #include <fcntl.h>
+#include <stdlib.h>
+#include <sys/time.h>
 
 #include "library.h"
 #include "sandbox.h"
@@ -49,3 +51,82 @@
   StartSeccompSandbox();
   CHECK(my_getpid() == pid);
 }
+
+#if defined(__x86_64__)
+
+// These test cases test patching calls to the vsyscall page, which is
+// present on x86-64 only.
+
+// The timer tests below could fail on a heavily loaded machine, but
+// we make a generous allowance for this.  They could also fail if the
+// clock is changed while the test is running.
+const int kMaxTime = 30; // Time in seconds
+
+extern "C" int my_vgettimeofday(struct timeval *tv, struct timezone *tz);
+extern char my_vgettimeofday_end[];
+
+extern "C" int my_vtime(time_t *time);
+extern char my_vtime_end[];
+
+extern "C" int my_vgetcpu(unsigned *cpu, unsigned *node, void *tcache);
+extern char my_vgetcpu_end[];
+
+void check_patching_vsyscall(char *func, char *func_end) {
+  patch_range(func, func_end);
+  CHECK(func[0] == '\x48');  // 48 83 ec 08      sub $8, %rsp (unmodified)
+  CHECK(func[1] == '\x83');
+  CHECK(func[2] == '\xec');
+  CHECK(func[3] == '\x08');
+  CHECK(func[4] == '\xe9');  // e9 XX XX XX XX   jmp X
+  CHECK(func[9] == '\x90');  // 90               nop
+  CHECK(func[10] == '\x90'); // 90               nop
+  CHECK(func[11] == '\x90'); // 90               nop
+  CHECK(func[12] == '\x90'); // 90               nop
+  CHECK(func[13] == '\x48'); // 48 83 c4 08      add $8, %rsp (unmodified)
+  CHECK(func[14] == '\x83');
+  CHECK(func[15] == '\xc4');
+  CHECK(func[16] == '\x08');
+  CHECK(func[17] == '\xc3'); // c3               ret (unmodified)
+}
+
+TEST(test_patching_vsyscall_gettimeofday) {
+  struct timeval time1;
+  struct timeval time2;
+  CHECK_SUCCEEDS(gettimeofday(&time1, NULL) == 0);
+  CHECK(my_vgettimeofday(&time2, NULL) == 0);
+  CHECK(time1.tv_sec <= time2.tv_sec && time2.tv_sec < time1.tv_sec + kMaxTime);
+
+  check_patching_vsyscall((char *) my_vgettimeofday, my_vgettimeofday_end);
+
+  StartSeccompSandbox();
+  CHECK(my_vgettimeofday(&time2, NULL) == 0);
+  CHECK(time1.tv_sec <= time2.tv_sec && time2.tv_sec < time1.tv_sec + kMaxTime);
+}
+
+TEST(test_patching_vsyscall_time) {
+  time_t time1;
+  time_t time2;
+  CHECK_SUCCEEDS((time1 = time(NULL)) != -1);
+  time2 = time(NULL);
+  CHECK(time1 <= time2 && time2 < time1 + kMaxTime);
+
+  check_patching_vsyscall((char *) my_vtime, my_vtime_end);
+
+  StartSeccompSandbox();
+  time2 = time(NULL);
+  CHECK(time1 <= time2 && time2 < time1 + kMaxTime);
+}
+
+TEST(test_patching_vsyscall_getcpu) {
+  CHECK(my_vgetcpu(NULL, NULL, NULL) == 0);
+
+  check_patching_vsyscall((char *) my_vgetcpu, my_vgetcpu_end);
+
+  StartSeccompSandbox();
+  // glibc's sched_getcpu() could still succeed if it goes via the
+  // vdso and just reads memory, but my_vgetcpu() is always redirected
+  // through the sandbox's handler and is rejected.
+  CHECK(my_vgetcpu(NULL, NULL, NULL) == -ENOSYS);
+}
+
+#endif
diff --git a/tests/test_patching_input.S b/tests/test_patching_input.S
index 3e7126d..e240603 100644
--- a/tests/test_patching_input.S
+++ b/tests/test_patching_input.S
@@ -22,5 +22,43 @@
         ret
 my_getpid_end:
 
+
+        // These routines call the vsyscall page, which is present on
+        // x86-64 only.
+
+#if defined(__x86_64__)
+
+        .global my_vgettimeofday
+        .global my_vgettimeofday_end
+my_vgettimeofday:
+        sub $8, %rsp  // Align the stack
+        mov $0xffffffffff600000, %rax
+        call *%rax
+        add $8, %rsp
+        ret
+my_vgettimeofday_end:
+
+        .global my_vtime
+        .global my_vtime_end
+my_vtime:
+        sub $8, %rsp  // Align the stack
+        mov $0xffffffffff600400, %rax
+        call *%rax
+        add $8, %rsp
+        ret
+my_vtime_end:
+
+        .global my_vgetcpu
+        .global my_vgetcpu_end
+my_vgetcpu:
+        sub $8, %rsp  // Align the stack
+        mov $0xffffffffff600800, %rax
+        call *%rax
+        add $8, %rsp
+        ret
+my_vgetcpu_end:
+
+#endif
+
         // Tell Linux not to disable no-execute protection for the process.
         .section .note.GNU-stack,"",@progbits