blob: e952457099ed6718d05c6f9e967fdb897285f425 [file] [log] [blame]
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "sandbox_impl.h"
#include "library.h"
#include "syscall_entrypoint.h"
#include "system_call_table.h"
namespace playground {
// Global variables (definitions of Sandbox's static data members)

// Descriptor for "/proc/self/maps"; -1 while no descriptor is held. It is
// either supplied through setProcSelfMaps() or opened by startSandbox().
int Sandbox::proc_self_maps_ = -1;
// Cached sandbox state; starts out as STATUS_UNKNOWN and is updated by
// supportsSeccompSandbox() and startSandbox().
enum Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN;
// Process id, recorded once by startSandbox().
int Sandbox::pid_;
// First descriptor (pair[0]) of the first socketpair created in
// startSandbox().
int Sandbox::processFdPub_;
// First descriptor (pair[2]) of the second socketpair created in
// startSandbox().
int Sandbox::cloneFdPub_
// Internal visibility is necessary to locate the symbol from assembly code
// on x86-64 (with %rip-relative addressing) in order for this to work
// in relocatable code (a .so or a PIE). On i386 this is not
// necessary but it does not hurt.
__attribute__((visibility("internal")));
// Receives the previously installed SIGSEGV action when
// setupSignalHandlers() installs the sandbox's own handler.
Sandbox::SysCalls::kernel_sigaction Sandbox::sa_segv_;
// NOTE(review): protectedMap_ and secureMemPool_ are only defined here; they
// are not referenced in this part of the file, so their exact role should be
// confirmed against their users.
Sandbox::ProtectedMap Sandbox::protectedMap_;
std::vector<SecureMem::Args*> Sandbox::secureMemPool_;
// Function pointer that startSandbox() calls to create the trusted thread.
// It defaults to Sandbox::createTrustedThread.
CreateTrustedThreadFunc g_create_trusted_thread =
Sandbox::createTrustedThread;
bool Sandbox::sendFd(int transport, int fd0, int fd1, const void* buf,
size_t len) {
int fds[2], count = 0;
if (fd0 >= 0) { fds[count++] = fd0; }
if (fd1 >= 0) { fds[count++] = fd1; }
if (!count) {
return false;
}
char cmsg_buf[CMSG_SPACE(count*sizeof(int))];
memset(cmsg_buf, 0, sizeof(cmsg_buf));
struct SysCalls::kernel_iovec iov[2] = { { 0 } };
struct SysCalls::kernel_msghdr msg = { 0 };
int dummy = 0;
iov[0].iov_base = &dummy;
iov[0].iov_len = sizeof(dummy);
if (buf && len > 0) {
iov[1].iov_base = const_cast<void *>(buf);
iov[1].iov_len = len;
}
msg.msg_iov = iov;
msg.msg_iovlen = (buf && len > 0) ? 2 : 1;
msg.msg_control = cmsg_buf;
msg.msg_controllen = CMSG_LEN(count*sizeof(int));
struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
cmsg->cmsg_len = CMSG_LEN(count*sizeof(int));
memcpy(CMSG_DATA(cmsg), fds, count*sizeof(int));
SysCalls sys;
return NOINTR_SYS(sys.sendmsg(transport, &msg, 0)) ==
(ssize_t)(sizeof(dummy) + ((buf && len > 0) ? len : 0));
}
bool Sandbox::getFd(int transport, int* fd0, int* fd1, void* buf, size_t*len) {
int count = 0;
int *err = NULL;
if (fd0) {
count++;
err = fd0;
*fd0 = -1;
}
if (fd1) {
if (!count++) {
err = fd1;
}
*fd1 = -1;
}
if (!count) {
return false;
}
char cmsg_buf[CMSG_SPACE(count*sizeof(int))];
memset(cmsg_buf, 0, sizeof(cmsg_buf));
struct SysCalls::kernel_iovec iov[2] = { { 0 } };
struct SysCalls::kernel_msghdr msg = { 0 };
iov[0].iov_base = err;
iov[0].iov_len = sizeof(int);
if (buf && len && *len > 0) {
iov[1].iov_base = buf;
iov[1].iov_len = *len;
}
msg.msg_iov = iov;
msg.msg_iovlen = (buf && len && *len > 0) ? 2 : 1;
msg.msg_control = cmsg_buf;
msg.msg_controllen = CMSG_LEN(count*sizeof(int));
SysCalls sys;
ssize_t bytes = NOINTR_SYS(sys.recvmsg(transport, &msg, 0));
if (len) {
*len = bytes > (int)sizeof(int) ?
bytes - sizeof(int) : 0;
}
if (bytes != (ssize_t)(sizeof(int) + ((buf && len && *len > 0) ? *len : 0))){
*err = bytes >= 0 ? 0 : -EBADF;
return false;
}
if (*err) {
// "err" is the first four bytes of the payload. If these are non-zero,
// the sender on the other side of the socketpair sent us an errno value.
// We don't expect to get any file handles in this case.
return false;
}
struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
if ((msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) ||
!cmsg ||
cmsg->cmsg_level != SOL_SOCKET ||
cmsg->cmsg_type != SCM_RIGHTS ||
cmsg->cmsg_len != CMSG_LEN(count*sizeof(int))) {
*err = -EBADF;
return false;
}
if (fd1) { *fd1 = ((int *)CMSG_DATA(cmsg))[--count]; }
if (fd0) { *fd0 = ((int *)CMSG_DATA(cmsg))[--count]; }
return true;
}
// Forward declaration of the SIGSEGV handler that setupSignalHandlers()
// installs. The asm label fixes the symbol name to
// "playground$segvSignalHandler"; the definition is not in this part of the
// file.
// NOTE(review): INTERNAL is a project macro, presumably a symbol visibility
// attribute -- confirm against its definition.
void segvSignalHandler(int signo, Sandbox::SysCalls::siginfo *context,
void *unused)
asm("playground$segvSignalHandler") INTERNAL;
void Sandbox::setupSignalHandlers() {
// Set SIGCHLD to SIG_DFL so that waitpid() can work
SysCalls sys;
struct SysCalls::kernel_sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_handler_ = SIG_DFL;
sys.sigaction(SIGCHLD, &sa, NULL);
// Set up SEGV handler for dealing with RDTSC instructions, system calls
// that have been rewritten to use INT0, for sigprocmask() emulation, for
// the creation of threads, and for user-provided SEGV handlers.
sa.sa_sigaction_ = segvSignalHandler;
sa.sa_flags = SA_SIGINFO | SA_NODEFER;
sys.sigaction(SIGSEGV, &sa, &sa_segv_);
// Unblock SIGSEGV and SIGCHLD
SysCalls::kernel_sigset_t mask;
memset(&mask, 0x00, sizeof(mask));
mask.sig[0] |= (1 << (SIGSEGV - 1)) | (1 << (SIGCHLD - 1));
sys.sigprocmask(SIG_UNBLOCK, &mask, 0);
}
// Forwards a system call request and waits for its result.
//
// Fills in the "sysnum" and "cookie" fields of "request", writes "size" bytes
// of it to processFdPub(), and then reads a long result from threadFdPub().
// Calls die() if either transfer is incomplete. Returns the result that was
// read back.
long Sandbox::forwardSyscall(int sysnum, struct RequestHeader* request,
                             int size) {
  SysCalls sys;

  // Stamp the request header before sending it off.
  request->sysnum = sysnum;
  request->cookie = cookie();

  const bool sent = write(sys, processFdPub(), request, size) == size;
  if (!sent) {
    die("Failed to send forwarded request");
  }

  long result;
  const bool received =
      read(sys, threadFdPub(), &result, sizeof(result)) == sizeof(result);
  if (!received) {
    die("Failed to receive forwarded result");
  }
  return result;
}
// Returns the SecureMem::Args pointer for the calling thread.
//
// The pointer is loaded from a fixed negative offset relative to the %gs
// segment base on x86-64 and relative to the %fs segment base on i386. Other
// target platforms are rejected at compile time.
SecureMem::Args* Sandbox::getSecureMem() {
// Check trusted_thread.cc for the magic offset that gets us from the TLS
// to the beginning of the secure memory area.
SecureMem::Args* ret;
#if defined(__x86_64__)
// 64-bit load from %gs:-0xE0
asm volatile(
"movq %%gs:-0xE0, %0\n"
: "=q"(ret));
#elif defined(__i386__)
// 32-bit load from %fs:-0x58
asm volatile(
"movl %%fs:-0x58, %0\n"
: "=r"(ret));
#else
#error Unsupported target platform
#endif
return ret;
}
// Passes the "/proc/self/maps" descriptor to the peer on "processFd" and
// waits for its acknowledgement.
//
// The descriptor is rewound to the start, sent across "processFd" with
// sendFd(), and then a single int is read back from "processFd". If any of
// these steps fails, die() is called (which is expected not to return).
void Sandbox::snapshotMemoryMappings(int processFd, int proc_self_maps) {
  SysCalls sys;
  int ack;
  // The steps run strictly in order; the first one that fails stops the
  // chain, so that no later step is attempted.
  const bool ok =
      sys.lseek(proc_self_maps, 0, SEEK_SET) == 0 &&
      sendFd(processFd, proc_self_maps, -1, NULL, 0) &&
      read(sys, processFd, &ack, sizeof(ack)) == sizeof(ack);
  if (!ok) {
    die("Cannot access /proc/self/maps");
  }
}
// Determines whether the seccomp sandbox can be enabled on this system.
//
// The result is cached in status_, so the probe runs at most once. The probe
// forks a child that redirects its standard descriptors to /dev/null, starts
// the sandbox, and then writes a single byte to a pipe. The parent reports
// STATUS_AVAILABLE if it receives that byte, and STATUS_UNSUPPORTED
// otherwise.
//
// proc_fd: if non-negative, the child opens "self/maps" relative to this
//          descriptor and hands the result to setProcSelfMaps() (presumably
//          this is a descriptor for /proc -- confirm against the callers).
//
// Returns non-zero if the sandbox is supported, and zero if it is not (which
// includes the case where the pipe or the child could not be created).
int Sandbox::supportsSeccompSandbox(int proc_fd) {
  if (status_ != STATUS_UNKNOWN) {
    return status_ != STATUS_UNSUPPORTED;
  }
  int fds[2];
  SysCalls sys;
  if (sys.pipe(fds)) {
    status_ = STATUS_UNSUPPORTED;
    return 0;
  }
  pid_t pid;
  switch ((pid = sys.fork())) {
    case -1:
      // The pipe was created successfully, but there is no child that could
      // use it. Close both ends so that the descriptors do not leak.
      (void)NOINTR_SYS(sys.close(fds[0]));
      (void)NOINTR_SYS(sys.close(fds[1]));
      status_ = STATUS_UNSUPPORTED;
      return 0;
    case 0: {
      // Child: detach from the parent's standard descriptors.
      int devnull = sys.open("/dev/null", O_RDWR, 0);
      if (devnull >= 0) {
        sys.dup2(devnull, 0);
        sys.dup2(devnull, 1);
        sys.dup2(devnull, 2);
        sys.close(devnull);
      }
      if (proc_fd >= 0) {
        setProcSelfMaps(sys.openat(proc_fd, "self/maps", O_RDONLY, 0));
      }
      startSandbox();
      // Getting here means the sandbox started; tell the parent about it.
      write(sys, fds[1], "", 1);
      // Try to tell the trusted thread to shut down the entire process in an
      // orderly fashion
      defaultSystemCallHandler(__NR_exit_group, 0, 0, 0, 0, 0, 0);
      // If that did not work (e.g. because the kernel does not know about the
      // exit_group() system call), make a direct _exit() system call instead.
      // This system call is unrestricted in seccomp mode, so it will always
      // succeed. Normally, we don't like it, because unlike exit_group() it
      // does not terminate any other thread. But since we know that
      // exit_group() exists in all kernels which support kernel-level threads,
      // this is OK; we only get here for old kernels where _exit() is OK.
      sys._exit(0);
    }
    default:
      // Parent: close the write end, so that the read below fails instead of
      // blocking if the child exits without writing the byte.
      (void)NOINTR_SYS(sys.close(fds[1]));
      char ch;
      if (read(sys, fds[0], &ch, 1) != 1) {
        status_ = STATUS_UNSUPPORTED;
      } else {
        status_ = STATUS_AVAILABLE;
      }
      // Reap the child and release the read end of the pipe.
      int rc;
      (void)NOINTR_SYS(sys.waitpid(pid, &rc, 0));
      (void)NOINTR_SYS(sys.close(fds[0]));
      return status_ != STATUS_UNSUPPORTED;
  }
}
// Records the descriptor that startSandbox() should use for
// "/proc/self/maps" instead of opening the file itself.
void Sandbox::setProcSelfMaps(int proc_self_maps) {
  Sandbox::proc_self_maps_ = proc_self_maps;
}
// Enables the sandbox for the current process.
//
// Calls die() if the sandbox is known to be unsupported, and returns
// immediately if it is already enabled. Otherwise, in this order, it:
//  - makes sure a "/proc/self/maps" descriptor is available,
//  - records the pid and sets up the signal handlers,
//  - initializes the system call table,
//  - creates two socketpairs and the trusted process,
//  - patches system calls in the VDSO and in well-known libraries,
//  - snapshots the memory mappings and closes the maps descriptor,
//  - creates the trusted thread through g_create_trusted_thread,
//  - installs the entry point for direct system calls, and
//  - sets status_ to STATUS_ENABLED.
void Sandbox::startSandbox() {
if (status_ == STATUS_UNSUPPORTED) {
die("The seccomp sandbox is not supported on this computer");
} else if (status_ == STATUS_ENABLED) {
return;
}
SysCalls sys;
// Open "/proc/self/maps" ourselves, unless a descriptor was already provided
// through setProcSelfMaps().
if (proc_self_maps_ < 0) {
proc_self_maps_ = sys.open("/proc/self/maps", O_RDONLY, 0);
if (proc_self_maps_ < 0) {
die("Cannot access \"/proc/self/maps\"");
}
}
// The pid is unchanged for the entire program, so we can retrieve it once
// and store it in a global variable.
pid_ = sys.getpid();
// Reset SIGCHLD to its default action, install the SIGSEGV handler, and
// unblock both signals
setupSignalHandlers();
// Set up the system call policy
SyscallTable::initializeSyscallTable();
// Get socketpairs for talking to the trusted process: pair[0]/pair[1] form
// the first pair, and pair[2]/pair[3] form the second one.
int pair[4];
if (sys.socketpair(AF_UNIX, SOCK_STREAM, 0, pair) ||
sys.socketpair(AF_UNIX, SOCK_STREAM, 0, pair+2)) {
// NOTE(review): this is reached when socketpair() fails, even though the
// message talks about the trusted thread.
die("Failed to create trusted thread");
}
processFdPub_ = pair[0];
cloneFdPub_ = pair[2];
SecureMemArgs* secureMem = createTrustedProcess(pair[0], pair[1],
pair[2], pair[3]);
// We find all libraries that have system calls and redirect the system
// calls to the sandbox. If we miss any system calls, the application will be
// terminated by the kernel's seccomp code. So, from a security point of
// view, if this code fails to identify system calls, we are still behaving
// correctly.
{
Maps maps(proc_self_maps_);
// Base names of the libraries to patch; the list is NULL-terminated.
const char *libs[] = { "ld", "libc", "librt", "libpthread", NULL };
// Intercept system calls in the VDSO segment (if any). This has to happen
// before intercepting system calls in any of the other libraries, as
// the main kernel entry point might be inside of the VDSO and we need to
// determine its address before we can compare it to jumps from inside
// other libraries.
for (Maps::const_iterator iter = maps.begin(); iter != maps.end(); ++iter){
Library* library = *iter;
if (library->isVDSO() && library->parseElf()) {
library->makeWritable(true);
library->patchSystemCalls();
library->makeWritable(false);
// Only the first VDSO mapping that parses successfully gets patched.
break;
}
}
// Intercept system calls in libraries that are known to have them.
for (Maps::const_iterator iter = maps.begin(); iter != maps.end(); ++iter){
Library* library = *iter;
const char* mapping = iter.name().c_str();
// Find the actual base name of the mapped library by skipping past any
// SPC and forward-slashes. We don't want to accidentally find matches,
// because the directory name included part of our well-known lib names.
//
// Typically, prior to pruning, entries would look something like this:
// 08:01 2289011 /lib/libc-2.7.so
//
// The loop first skips past the last space, and then past the last
// forward-slash of whatever remains.
for (const char *delim = " /"; *delim; ++delim) {
const char* skip = strrchr(mapping, *delim);
if (skip) {
mapping = skip + 1;
}
}
// The base name has to start with one of the well-known names, and the
// character right after that prefix must not be an ASCII letter. So,
// "libc-2.7.so" matches "libc", but "libcrypto.so" does not.
for (const char **ptr = libs; *ptr; ptr++) {
const char *name = strstr(mapping, *ptr);
if (name == mapping) {
char ch = name[strlen(*ptr)];
if (ch < 'A' || (ch > 'Z' && ch < 'a') || ch > 'z') {
if (library->parseElf()) {
library->makeWritable(true);
library->patchSystemCalls();
library->makeWritable(false);
// The library has been patched; stop checking the remaining names and
// move on to the next mapping.
break;
}
}
}
}
}
}
// Take a snapshot of the current memory mappings. These mappings will be
// off-limits to all future mmap(), munmap(), mremap(), and mprotect() calls.
// This also provides a synchronization point that ensures the trusted
// process has finished initialization.
snapshotMemoryMappings(processFdPub_, proc_self_maps_);
// The maps descriptor is no longer needed; reset it to the "not open" value.
(void)NOINTR_SYS(sys.close(proc_self_maps_));
proc_self_maps_ = -1;
// Creating the trusted thread enables sandboxing
g_create_trusted_thread(secureMem);
// Force direct system calls to jump to our entry point.
struct {
// Instantiate another copy of linux_syscall_support.h. This time, we
// define SYS_SYSCALL_ENTRYPOINT. This gives us access to a
// get_syscall_entrypoint() function that we can use to install a pointer
// to our system call entrypoint handler.
// Any user of linux_syscall_support.h who wants to make sure that the
// sandbox properly redirects its system calls would define the same
// macro.
#undef SYS_ERRNO
#define SYS_INLINE inline
#define SYS_PREFIX -1
#define SYS_SYSCALL_ENTRYPOINT "playground$syscallEntryPoint"
#undef SYS_LINUX_SYSCALL_SUPPORT_H
#include "linux_syscall_support.h"
} entrypoint;
*entrypoint.get_syscall_entrypoint() = syscallEntryPointNoFrame;
// We can no longer check for sandboxing support at this point, but we also
// know for a fact that it is available (as we just turned it on). So update
// the status to reflect this information.
status_ = STATUS_ENABLED;
}
} // namespace