blob: f7c85113db789ec1676bd967034a83f372413e45 [file] [log] [blame]
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <time.h>
#include "sandbox/linux/seccomp-bpf/sandbox_bpf.h"
#include "sandbox/linux/seccomp-bpf/verifier.h"
// The kernel gives us a sandbox, we turn it into a playground :-)
// This is version 2 of the playground; version 1 was built on top of
// pre-BPF seccomp mode.
namespace playground2 {
// Minimal policy used to find out whether the sandbox was really activated.
// getpid() is forced to fail with EPERM (an easily observable marker),
// exit_group() stays permitted so that the probe can report its verdict,
// and every other system call fails with EINVAL.
ErrorCode Sandbox::probeEvaluator(int signo) {
  if (signo == __NR_getpid) {
    // EPERM proves to the caller that the filter was consulted.
    return ErrorCode(EPERM);
  }
  if (signo == __NR_exit_group) {
    // The probe must be able to exit with a non-default return code.
    return ErrorCode(ErrorCode::ERR_ALLOWED);
  }
  // Anything else fails in an easily recognizable way.
  return ErrorCode(EINVAL);
}
// Code executed inside the probe sandbox. If getpid() fails with EPERM, the
// process exits with status 100; otherwise the function simply returns.
void Sandbox::probeProcess(void) {
  const long rc = syscall(__NR_getpid);
  if (rc >= 0) {
    // getpid() was not intercepted.
    return;
  }
  if (errno != EPERM) {
    // It failed, but not with the expected marker value.
    return;
  }
  syscall(__NR_exit_group, (intptr_t)100);
}
// Policy that allows every system call number inside
// [MIN_SYSCALL, MAX_SYSCALL] and answers ENOSYS for anything outside of it.
ErrorCode Sandbox::allowAllEvaluator(int signo) {
  const int lowest = static_cast<int>(MIN_SYSCALL);
  const int highest = static_cast<int>(MAX_SYSCALL);
  const bool in_range = signo >= lowest && signo <= highest;
  return in_range ? ErrorCode(ErrorCode::ERR_ALLOWED) : ErrorCode(ENOSYS);
}
// Code executed inside the allow-all sandbox. Exits with status 100 if
// time() works; otherwise the function simply returns.
void Sandbox::tryVsyscallProcess(void) {
  // time() is implemented as a vsyscall. With an older glibc, with
  // vsyscall=emulate and some versions of the seccomp BPF patch the process
  // may get SIGKILL-ed at this point, which the caller then notices.
  time_t now;
  const time_t result = time(&now);
  if (result == static_cast<time_t>(-1)) {
    return;
  }
  syscall(__NR_exit_group, (intptr_t)100);
}
// Runs |CodeInSandbox| in a forked child process after installing
// |syscallEvaluator| as the child's only sandbox policy.
//
//   CodeInSandbox     function executed in the child once the sandbox has
//                     been started.
//   syscallEvaluator  policy handed to setSandboxPolicy() in the child.
//   proc_fd           descriptor handed to setProcFd() in the child.
//
// Returns true if and only if the child exited normally with exit status 100.
// Any unexpected failure (sigprocmask, pipe2, fork, close, waitpid) is fatal
// via SANDBOX_DIE(). If the child did not report success and left output on
// its stderr pipe, that output is used as the fatal error message.
bool Sandbox::RunFunctionInPolicy(void (*CodeInSandbox)(),
EvaluateSyscall syscallEvaluator,
int proc_fd) {
// Block all signals before forking a child process. This prevents an
// attacker from manipulating our test by sending us an unexpected signal.
sigset_t oldMask, newMask;
if (sigfillset(&newMask) ||
sigprocmask(SIG_BLOCK, &newMask, &oldMask)) {
SANDBOX_DIE("sigprocmask() failed");
}
// Pipe that carries the child's stderr back to the parent. It is
// non-blocking so that the parent's read() below never stalls.
int fds[2];
if (pipe2(fds, O_NONBLOCK|O_CLOEXEC)) {
SANDBOX_DIE("pipe() failed");
}
pid_t pid = fork();
if (pid < 0) {
// Die if we cannot fork(). We would probably fail a little later
// anyway, as the machine is likely very close to running out of
// memory.
// But what we don't want to do is return "false", as a crafty
// attacker might cause fork() to fail at will and could trick us
// into running without a sandbox.
sigprocmask(SIG_SETMASK, &oldMask, NULL); // OK, if it fails
SANDBOX_DIE("fork() failed unexpectedly");
}
// In the child process
if (!pid) {
// Test a very simple sandbox policy to verify that we can
// successfully turn on sandboxing.
Die::EnableSimpleExit();
// Close the read end and make the write end of the pipe our stderr
// (file descriptor 2).
if (HANDLE_EINTR(close(fds[0])) ||
dup2(fds[1], 2) != 2 ||
HANDLE_EINTR(close(fds[1]))) {
static const char msg[] = "Failed to set up stderr\n";
// Best effort only; the result of write() is deliberately ignored.
if (HANDLE_EINTR(write(fds[1], msg, sizeof(msg)-1))) { }
} else {
// Start from a clean slate, so that only |syscallEvaluator| is active.
evaluators_.clear();
setSandboxPolicy(syscallEvaluator, NULL);
setProcFd(proc_fd);
// By passing "quiet=true" to "startSandboxInternal()" we suppress
// messages for expected and benign failures (e.g. if the current
// kernel lacks support for BPF filters).
startSandboxInternal(true);
// Run our code in the sandbox
CodeInSandbox();
}
// The child never returns to the caller; if CodeInSandbox() came back,
// terminate here without an error message.
SANDBOX_DIE(NULL);
}
// In the parent process.
if (HANDLE_EINTR(close(fds[1]))) {
SANDBOX_DIE("close() failed");
}
// Restore the signal mask that was in effect when we were called.
if (sigprocmask(SIG_SETMASK, &oldMask, NULL)) {
SANDBOX_DIE("sigprocmask() failed");
}
int status;
if (HANDLE_EINTR(waitpid(pid, &status, 0)) != pid) {
SANDBOX_DIE("waitpid() failed unexpectedly");
}
// Exit status 100 is the value that the probe functions pass to
// exit_group() on success.
bool rc = WIFEXITED(status) && WEXITSTATUS(status) == 100;
// If we fail to support sandboxing, there might be an additional
// error message. If so, this was an entirely unexpected and fatal
// failure. We should report the failure and somebody must fix
// things. This is probably a security-critical bug in the sandboxing
// code.
if (!rc) {
char buf[4096];
ssize_t len = HANDLE_EINTR(read(fds[0], buf, sizeof(buf) - 1));
if (len > 0) {
// Strip trailing newlines, but always keep at least one character.
while (len > 1 && buf[len-1] == '\n') {
--len;
}
buf[len] = '\000';
SANDBOX_DIE(buf);
}
}
if (HANDLE_EINTR(close(fds[0]))) {
SANDBOX_DIE("close() failed");
}
return rc;
}
// Determines at run-time whether seccomp BPF sandboxing works, by running
// two probes in child processes: the EPERM-on-getpid() probe, followed by
// the vsyscall probe under an allow-all policy. Both have to succeed.
bool Sandbox::kernelSupportSeccompBPF(int proc_fd) {
#if defined(SECCOMP_BPF_VALGRIND_HACKS)
  // Valgrind doesn't like our run-time test. Skip testing and assume that
  // sandboxing is always supported. This feature should only ever be enabled
  // when debugging.
  if (RUNNING_ON_VALGRIND) {
    return true;
  }
#endif
  if (!RunFunctionInPolicy(probeProcess, Sandbox::probeEvaluator, proc_fd)) {
    return false;
  }
  return RunFunctionInPolicy(tryVsyscallProcess, Sandbox::allowAllEvaluator,
                             proc_fd);
}
// Reports (and caches in status_) whether the seccomp sandbox can be used by
// the calling process. |proc_fd| is forwarded to isSingleThreaded() and to
// the kernel probes.
Sandbox::SandboxStatus Sandbox::supportsSeccompSandbox(int proc_fd) {
  switch (status_) {
    case STATUS_ENABLED:
      // If the sandbox is currently active, we clearly must have support for
      // sandboxing.
      return status_;

    case STATUS_AVAILABLE:
      // Even if the sandbox was previously available, something might have
      // changed in our run-time environment. Check one more time.
      if (!isSingleThreaded(proc_fd)) {
        status_ = STATUS_UNAVAILABLE;
      }
      return status_;

    case STATUS_UNAVAILABLE:
      // All state transitions resulting in STATUS_UNAVAILABLE are immediately
      // preceded by STATUS_AVAILABLE, and they happen if and only if the
      // process was found to be multi-threaded. So, a single-threaded process
      // that is in the STATUS_UNAVAILABLE state can safely be assumed to have
      // sandboxing available.
      if (isSingleThreaded(proc_fd)) {
        status_ = STATUS_AVAILABLE;
      }
      return status_;

    case STATUS_UNKNOWN:
      // No usable cached value; perform the thorough check now.
      if (!kernelSupportSeccompBPF(proc_fd)) {
        status_ = STATUS_UNSUPPORTED;
      } else if (isSingleThreaded(proc_fd)) {
        status_ = STATUS_AVAILABLE;
      } else {
        // The probes run in a child process, which is always single-threaded
        // from the sandbox's point of view. The caller itself is not, so the
        // sandbox is marked as temporarily unavailable.
        status_ = STATUS_UNAVAILABLE;
      }
      return status_;

    default:
      break;
  }
  // Any remaining state is reported unchanged.
  return status_;
}
// Remembers |proc_fd| as the descriptor that startSandboxInternal() uses
// for its single-threadedness check. A negative value means "none".
void Sandbox::setProcFd(int proc_fd) {
  Sandbox::proc_fd_ = proc_fd;
}
// Turns on the sandbox for the calling process. Dies if the sandbox is known
// to be unusable, is already running, or if the process is multi-threaded.
// |quiet| is forwarded to installFilter().
void Sandbox::startSandboxInternal(bool quiet) {
  if (status_ == STATUS_ENABLED) {
    SANDBOX_DIE("Cannot start sandbox recursively. Use multiple calls to "
                "setSandboxPolicy() to stack policies instead");
  }
  if (status_ == STATUS_UNSUPPORTED || status_ == STATUS_UNAVAILABLE) {
    SANDBOX_DIE("Trying to start sandbox, even though it is known to be "
                "unavailable");
  }

  // Fall back to opening /proc ourselves, if the caller did not provide a
  // descriptor. For now, we continue in degraded mode if that fails; in the
  // future, we might want to tighten this requirement.
  if (proc_fd_ < 0) {
    proc_fd_ = open("/proc", O_RDONLY|O_DIRECTORY);
  }

  if (!isSingleThreaded(proc_fd_)) {
    SANDBOX_DIE("Cannot start sandbox, if process is already multi-threaded");
  }

  // We no longer need access to any files in /proc. Release the descriptor
  // before installing the filters, just in case our policy denies close().
  if (proc_fd_ >= 0) {
    if (HANDLE_EINTR(close(proc_fd_))) {
      SANDBOX_DIE("Failed to close file descriptor for /proc");
    }
    proc_fd_ = -1;
  }

  installFilter(quiet);

  // From here on, we are inside the sandbox.
  status_ = STATUS_ENABLED;
}
// Checks whether the calling process has exactly one thread, by looking at
// the link count of the "self/task" directory below |proc_fd|.
//
//   proc_fd  descriptor for /proc, or a negative value if none is available.
//
// Returns true if the link count of "self/task" is 3, or if |proc_fd| is
// negative (in which case we cannot tell and hope for the best). Returns
// false if the directory cannot be opened, inspected or closed, or if the
// link count differs.
bool Sandbox::isSingleThreaded(int proc_fd) {
  if (proc_fd < 0) {
    // Cannot determine whether program is single-threaded. Hope for
    // the best...
    return true;
  }

  const int task = openat(proc_fd, "self/task", O_RDONLY|O_DIRECTORY);
  if (task < 0) {
    return false;
  }

  struct stat sb;
  const bool single = fstat(task, &sb) == 0 && sb.st_nlink == 3;

  // Close the descriptor exactly once, whatever the outcome above. The
  // previous code retried close() in its error path after a failed close();
  // in a multi-threaded process that second attempt could hit a descriptor
  // that another thread had opened in the meantime.
  const bool closed = HANDLE_EINTR(close(task)) == 0;

  // A failing close() keeps being reported as "not single-threaded".
  return single && closed;
}
// Returns true if |code| makes the system call fail: either it is a
// SECCOMP_RET_TRAP action, or it is SECCOMP_RET_ERRNO combined with an errno
// value in [ERR_MIN_ERRNO, ERR_MAX_ERRNO].
bool Sandbox::isDenied(const ErrorCode& code) {
  if ((code.err() & SECCOMP_RET_ACTION) == SECCOMP_RET_TRAP) {
    return true;
  }
  if (code.err() < (SECCOMP_RET_ERRNO + ErrorCode::ERR_MIN_ERRNO)) {
    return false;
  }
  return code.err() <= (SECCOMP_RET_ERRNO + ErrorCode::ERR_MAX_ERRNO);
}
// Sanity-checks |syscallEvaluator| and dies if the policy does things that
// are likely unsafe and unintended. Similar checks run again later, when the
// BPF program is compiled; those catch incorrectly stacked evaluators.
// The argument evaluator is currently not inspected.
void Sandbox::policySanityChecks(EvaluateSyscall syscallEvaluator,
                                 EvaluateArguments) {
  if (!isDenied(syscallEvaluator(-1))) {
    SANDBOX_DIE("Negative system calls should always be disallowed by policy");
  }

#ifndef NDEBUG
#if defined(__i386__) || defined(__x86_64__)
#if defined(__x86_64__) && defined(__ILP32__)
  {
    // x32 build: the same numbers with bit 30 cleared must all be denied.
    const unsigned int last = MAX_SYSCALL & ~0x40000000u;
    for (unsigned int nr = MIN_SYSCALL & ~0x40000000u; nr <= last; ++nr) {
      if (!isDenied(syscallEvaluator(nr))) {
        SANDBOX_DIE("In x32 mode, you should not allow any non-x32 "
                    "system calls");
      }
    }
  }
#else
  {
    // Non-x32 build: the same numbers with bit 30 set must all be denied.
    const unsigned int last = MAX_SYSCALL | 0x40000000u;
    for (unsigned int nr = MIN_SYSCALL | 0x40000000u; nr <= last; ++nr) {
      if (!isDenied(syscallEvaluator(nr))) {
        SANDBOX_DIE("x32 system calls should be explicitly disallowed");
      }
    }
  }
#endif
#endif
#endif

  // Interesting boundary values just outside of the valid system call
  // range: 0x7FFFFFFF, 0x80000000, 0xFFFFFFFF, MIN_SYSCALL-1, MAX_SYSCALL+1.
  // They all should be denied. They are probed in this order, and the first
  // one that is not denied is fatal.
  const int boundary_probes[] = {
    std::numeric_limits<int>::max(),
    std::numeric_limits<int>::min(),
    -1,
    static_cast<int>(MIN_SYSCALL) - 1,
    static_cast<int>(MAX_SYSCALL) + 1
  };
  const size_t probe_count =
      sizeof(boundary_probes) / sizeof(boundary_probes[0]);
  for (size_t i = 0; i < probe_count; ++i) {
    if (!isDenied(syscallEvaluator(boundary_probes[i]))) {
      SANDBOX_DIE("Even for default-allow policies, you must never allow system "
                  "calls outside of the standard system call range");
    }
  }
}
// Registers a policy (system call evaluator plus argument evaluator) after
// sanity-checking it. Dies if the sandbox has already been started.
void Sandbox::setSandboxPolicy(EvaluateSyscall syscallEvaluator,
                               EvaluateArguments argumentEvaluator) {
  if (status_ == STATUS_ENABLED) {
    SANDBOX_DIE("Cannot change policy after sandbox has started");
  }
  policySanityChecks(syscallEvaluator, argumentEvaluator);

  const Evaluators::value_type policy(syscallEvaluator, argumentEvaluator);
  evaluators_.push_back(policy);
}
// Compiles the registered policy into a BPF program and installs it in the
// kernel with prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, ...).
//
//   quiet  if true, failures of the two prctl() calls die without an error
//          message (NULL is passed to SANDBOX_DIE()).
//
// Also installs sigSys() as the SIGSYS handler and unblocks SIGSYS. Dies on
// any failure. On success, evaluators_ and errMap_ have been emptied.
void Sandbox::installFilter(bool quiet) {
// Verify that the user pushed a policy.
if (evaluators_.empty()) {
// Shared failure exit; the "goto filter_failed" statements below jump here.
filter_failed:
SANDBOX_DIE("Failed to configure system call filters");
}
// Set new SIGSYS handler
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_sigaction = &sigSys;
sa.sa_flags = SA_SIGINFO;
if (sigaction(SIGSYS, &sa, NULL) < 0) {
goto filter_failed;
}
// Unmask SIGSYS
sigset_t mask;
if (sigemptyset(&mask) ||
sigaddset(&mask, SIGSYS) ||
sigprocmask(SIG_UNBLOCK, &mask, NULL)) {
goto filter_failed;
}
// We can't handle stacked evaluators, yet. We'll get there eventually
// though. Hang tight.
if (evaluators_.size() != 1) {
SANDBOX_DIE("Not implemented");
}
// Assemble the BPF filter program.
Program *program = new Program();
if (!program) {
SANDBOX_DIE("Out of memory");
}
// If the architecture doesn't match SECCOMP_ARCH, disallow the
// system call.
program->push_back((struct sock_filter)
BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct arch_seccomp_data, arch)));
program->push_back((struct sock_filter)
BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH, 1, 0));
program->push_back((struct sock_filter)
BPF_STMT(BPF_RET+BPF_K,
Kill("Invalid audit architecture in BPF filter").err()));
// Grab the system call number, so that we can implement jump tables.
program->push_back((struct sock_filter)
BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct arch_seccomp_data, nr)));
// On Intel architectures, verify that system call numbers are in the
// expected number range. The older i386 and x86-64 APIs clear bit 30
// on all system calls. The newer x86-32 API always sets bit 30.
#if defined(__i386__) || defined(__x86_64__)
#if defined(__x86_64__) && defined(__ILP32__)
// x32 build: skip the kill instruction below if bit 30 is set.
program->push_back((struct sock_filter)
BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, 1, 0));
#else
// Non-x32 build: skip the kill instruction below if bit 30 is clear.
program->push_back((struct sock_filter)
BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, 0, 1));
#endif
program->push_back((struct sock_filter)
BPF_STMT(BPF_RET+BPF_K,
Kill("Illegal mixing of system call ABIs").err()));
#endif
{
// Evaluate all possible system calls and group their ErrorCodes into
// ranges of identical codes.
Ranges ranges;
findRanges(&ranges);
// Compile the system call ranges to an optimized BPF program
RetInsns rets;
emitJumpStatements(program, &rets, ranges.begin(), ranges.end());
emitReturnStatements(program, rets);
}
// Make sure compilation resulted in BPF program that executes
// correctly. Otherwise, there is an internal error in our BPF compiler.
// There is really nothing the caller can do until the bug is fixed.
#ifndef NDEBUG
const char *err = NULL;
if (!Verifier::verifyBPF(*program, evaluators_, &err)) {
SANDBOX_DIE(err);
}
#endif
// We want to be very careful in not imposing any requirements on the
// policies that are set with setSandboxPolicy(). This means, as soon as
// the sandbox is active, we shouldn't be relying on libraries that could
// be making system calls. This, for example, means we should avoid
// using the heap and we should avoid using STL functions.
// Temporarily copy the contents of the "program" vector into a
// stack-allocated array; and then explicitly destroy that object.
// This makes sure we don't ex- or implicitly call new/delete after we
// installed the BPF filter program in the kernel. Depending on the
// system memory allocator that is in effect, these operators can result
// in system calls to things like munmap() or brk().
// Note that the array size is only known at run-time (a variable-length
// array), and that the instruction count is narrowed to "unsigned short".
struct sock_filter bpf[program->size()];
const struct sock_fprog prog = {
static_cast<unsigned short>(program->size()), bpf };
memcpy(bpf, &(*program)[0], sizeof(bpf));
delete program;
// Release memory that is no longer needed
evaluators_.clear();
errMap_.clear();
#if defined(SECCOMP_BPF_VALGRIND_HACKS)
// Valgrind is really not happy about our sandbox. Disable it when running
// in Valgrind. This feature is dangerous and should never be enabled by
// default. We protect it behind a pre-processor option.
if (!RUNNING_ON_VALGRIND)
#endif
{
// Install BPF filter program
// PR_SET_NO_NEW_PRIVS is set first; the filter is only installed if that
// call succeeded.
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
SANDBOX_DIE(quiet ? NULL : "Kernel refuses to enable no-new-privs");
} else {
if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
SANDBOX_DIE(quiet ? NULL : "Kernel refuses to turn on BPF filters");
}
}
}
return;
}
// Evaluates the first registered policy for system call number 0 and for
// every number from max(1, MIN_SYSCALL) to MAX_SYSCALL+1, and appends to
// |ranges| one Range per run of consecutive numbers that share the same
// ErrorCode. The final range extends to the largest unsigned value.
void Sandbox::findRanges(Ranges *ranges) {
  // Please note that "struct seccomp_data" defines system calls as a signed
  // int32_t, but BPF instructions always operate on unsigned quantities. We
  // deal with this disparity by enumerating from MIN_SYSCALL to MAX_SYSCALL,
  // and then verifying that the rest of the number range (both positive and
  // negative) all return the same ErrorCode.
  EvaluateSyscall evaluate = evaluators_.begin()->first;

  uint32_t range_start = 0;
  ErrorCode range_err = evaluate(range_start);

  const uint32_t first = std::max(1u, MIN_SYSCALL);
  for (uint32_t nr = first; nr <= MAX_SYSCALL + 1; ++nr) {
    ErrorCode current = evaluate(static_cast<int>(nr));
    if (current.Equals(range_err)) {
      // Still inside the same range.
      continue;
    }
    // The policy changed at |nr|: close the previous range, open a new one.
    ranges->push_back(Range(range_start, nr - 1, range_err));
    range_start = nr;
    range_err = current;
  }

  // As we looped all the way past the valid system calls (i.e.
  // MAX_SYSCALL+1), "range_err" should at this point be the "default" policy
  // for all system call numbers that don't have an explicit handler in the
  // system call evaluator. But as we are quite paranoid, we verify that there
  // actually is a consistent "default" policy. We don't iterate over all
  // possible 2^32 values; we just perform spot checks at the boundaries:
  // 0x7FFFFFFF, 0x80000000 and 0xFFFFFFFF.
  const bool consistent_default =
      range_err.Equals(evaluate(std::numeric_limits<int>::max())) &&
      range_err.Equals(evaluate(std::numeric_limits<int>::min())) &&
      range_err.Equals(evaluate(-1));
  if (!consistent_default) {
    SANDBOX_DIE("Invalid seccomp policy");
  }

  ranges->push_back(
      Range(range_start, std::numeric_limits<unsigned>::max(), range_err));
}
// Recursively appends to |program| the BPF_JGE instructions that perform a
// binary search over the system call ranges in [start, stop).
//
//   program  BPF program under construction; instructions are appended.
//   rets     collects, per ErrorCode value, the jump instructions whose
//            targets still have to be patched by emitReturnStatements().
//   start    first range to consider.
//   stop     one past the last range to consider.
//
// Dies if fewer than two ranges are passed in, if the program has reached
// SECCOMP_MAX_PROGRAM_SIZE, or if a jump distance does not fit in 0..255.
void Sandbox::emitJumpStatements(Program *program, RetInsns *rets,
Ranges::const_iterator start,
Ranges::const_iterator stop) {
// We convert the list of system call ranges into jump table that performs
// a binary search over the ranges.
// As a sanity check, we need to have at least two distinct ranges for us
// to be able to build a jump table.
if (stop - start <= 1) {
SANDBOX_DIE("Invalid set of system call ranges");
}
// Pick the range object that is located at the mid point of our list.
// We compare our system call number against the lowest valid system call
// number in this range object. If our number is lower, it is outside of
// this range object. If it is greater or equal, it might be inside.
Ranges::const_iterator mid = start + (stop - start)/2;
// Index of the jump instruction that is about to be emitted.
Program::size_type jmp = program->size();
if (jmp >= SECCOMP_MAX_PROGRAM_SIZE) {
// Shared failure exit; also the target of "goto compiler_err" below.
compiler_err:
SANDBOX_DIE("Internal compiler error; failed to compile jump table");
}
program->push_back((struct sock_filter)
BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, mid->from,
// Jump targets are place-holders that will be fixed up later.
0, 0));
// The comparison turned out to be false; i.e. our system call number is
// less than the range object at the mid point of the list.
if (mid - start == 1) {
// If we have narrowed things down to a single range object, we can
// return from the BPF filter program.
// Instead of emitting a BPF_RET statement, we want to coalesce all
// identical BPF_RET statements into a single instance. This results in
// a more efficient BPF program that uses less CPU cache.
// Since all branches in BPF programs have to be forward branches, we
// keep track of our current instruction pointer and then fix up the
// branch when we emit the BPF_RET statement in emitReturnStatements().
(*rets)[start->err.err()].push_back(FixUp(jmp, false));
} else {
// Sub-divide the list of ranges and continue recursively.
// The "false" target keeps its place-holder value of 0, so execution
// continues with the instructions that this call emits next.
emitJumpStatements(program, rets, start, mid);
}
// The comparison turned out to be true; i.e. our system call number is
// greater or equal to the range object at the mid point of the list.
if (stop - mid == 1) {
// We narrowed things down to a single range object. Remember instruction
// pointer and exit code, so that we can patch up the target of the jump
// instruction in emitReturnStatements().
(*rets)[mid->err.err()].push_back(FixUp(jmp, true));
} else {
// We now know where the block of instructions for the "true" comparison
// starts. Patch up the jump target of the BPF_JMP instruction that we
// emitted earlier.
int distance = program->size() - jmp - 1;
if (distance < 0 || distance > 255) {
goto compiler_err;
}
(*program)[jmp].jt = distance;
// Sub-divide the list of ranges and continue recursively.
emitJumpStatements(program, rets, mid, stop);
}
}
// Appends one BPF_RET instruction to |program| for every distinct exit code
// in |rets|, and patches each jump instruction recorded for that code so
// that it branches to the new BPF_RET. Dies if the program has reached
// SECCOMP_MAX_PROGRAM_SIZE or if a jump distance does not fit in 0..255.
void Sandbox::emitReturnStatements(Program *program, const RetInsns& rets) {
  for (RetInsns::const_iterator ret = rets.begin(); ret != rets.end(); ++ret) {
    // Position that the BPF_RET instruction for this exit code will occupy.
    const Program::size_type ret_ip = program->size();
    if (ret_ip >= SECCOMP_MAX_PROGRAM_SIZE) {
      SANDBOX_DIE("Internal compiler error; failed to compile jump table");
    }
    program->push_back((struct sock_filter)
        BPF_STMT(BPF_RET+BPF_K, ret->first));

    // Patch up every jump instruction that is waiting for this BPF_RET.
    const std::vector<FixUp>& fixups = ret->second;
    for (std::vector<FixUp>::const_iterator fix = fixups.begin();
         fix != fixups.end();
         ++fix) {
      // Jumps are always relative and they are always forward.
      const int distance = ret_ip - fix->addr - 1;
      if (distance < 0 || distance > 255) {
        SANDBOX_DIE("Internal compiler error; failed to compile jump table");
      }
      // Patch either the "true" or the "false" target of the jump.
      struct sock_filter& jump = (*program)[fix->addr];
      if (fix->jt) {
        jump.jt = distance;
      } else {
        jump.jf = distance;
      }
    }
  }
}
// SIGSYS handler (installed by installFilter() with SA_SIGINFO). Looks up the
// trap handler selected by info->si_errno, calls it with a description of the
// intercepted system call, and stores the handler's return value in the
// register that holds the system call result.
//
//   nr            signal number; must be SIGSYS.
//   info          signal information; si_code must be SYS_SECCOMP and
//                 si_errno must be a valid 1-based index into trapArray_.
//   void_context  the ucontext_t of the interrupted code; must not be NULL.
//
// Dies if any of the sanity checks fails.
void Sandbox::sigSys(int nr, siginfo_t *info, void *void_context) {
// Various sanity checks to make sure we actually received a signal
// triggered by a BPF filter. If something else triggered SIGSYS
// (e.g. kill()), there is really nothing we can do with this signal.
if (nr != SIGSYS || info->si_code != SYS_SECCOMP || !void_context ||
info->si_errno <= 0 ||
static_cast<size_t>(info->si_errno) > trapArraySize_) {
// SANDBOX_DIE() can call LOG(FATAL). This is not normally async-signal
// safe and can lead to bugs. We should eventually implement a different
// logging and reporting mechanism that is safe to be called from
// the sigSys() handler.
// TODO: If we feel confident that our code otherwise works correctly, we
// could actually make an argument that spurious SIGSYS should
// just get silently ignored. TBD
// Shared failure exit; also the target of "goto sigsys_err" below.
sigsys_err:
SANDBOX_DIE("Unexpected SIGSYS received");
}
// Signal handlers should always preserve "errno". Otherwise, we could
// trigger really subtle bugs.
int old_errno = errno;
// Obtain the signal context. This, most notably, gives us access to
// all CPU registers at the time of the signal.
ucontext_t *ctx = reinterpret_cast<ucontext_t *>(void_context);
// Obtain the siginfo information that is specific to SIGSYS. Unfortunately,
// most versions of glibc don't include this information in siginfo_t. So,
// we need to explicitly copy it into a arch_sigsys structure.
struct arch_sigsys sigsys;
memcpy(&sigsys, &info->_sifields, sizeof(sigsys));
// Some more sanity checks.
// What the kernel reported in siginfo has to agree with the saved
// register context.
if (sigsys.ip != reinterpret_cast<void *>(SECCOMP_IP(ctx)) ||
sigsys.nr != static_cast<int>(SECCOMP_SYSCALL(ctx)) ||
sigsys.arch != SECCOMP_ARCH) {
goto sigsys_err;
}
// Copy the seccomp-specific data into a arch_seccomp_data structure. This
// is what we are showing to TrapFnc callbacks that the system call evaluator
// registered with the sandbox.
struct arch_seccomp_data data = {
sigsys.nr,
SECCOMP_ARCH,
reinterpret_cast<uint64_t>(sigsys.ip),
{
static_cast<uint64_t>(SECCOMP_PARM1(ctx)),
static_cast<uint64_t>(SECCOMP_PARM2(ctx)),
static_cast<uint64_t>(SECCOMP_PARM3(ctx)),
static_cast<uint64_t>(SECCOMP_PARM4(ctx)),
static_cast<uint64_t>(SECCOMP_PARM5(ctx)),
static_cast<uint64_t>(SECCOMP_PARM6(ctx))
}
};
// Now call the TrapFnc callback associated with this particular instance
// of SECCOMP_RET_TRAP.
// Trap ids start at 1 (see Trap()), hence the "- 1" when indexing.
const ErrorCode& err = trapArray_[info->si_errno - 1];
intptr_t rc = err.fnc_(data, err.aux_);
// Update the CPU register that stores the return code of the system call
// that we just handled, and restore "errno" to the value that it had
// before entering the signal handler.
SECCOMP_RESULT(ctx) = static_cast<greg_t>(rc);
errno = old_errno;
return;
}
// Returns the ErrorCode that makes the BPF filter raise SECCOMP_RET_TRAP for
// the handler |fnc| with auxiliary data |aux|. Each unique (fnc, aux) pair
// is assigned a 1-based id on first use; later calls with the same pair
// reuse that id. The resulting ErrorCode is also recorded in errMap_.
ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) {
  const std::pair<ErrorCode::TrapFnc, const void *> key(fnc, aux);
  uint16_t id;

  const TrapIds::const_iterator known = trapIds_.find(key);
  if (known == trapIds_.end()) {
    // This is a new pair. Remember it and assign a new id.
    // Please note that we have to store traps in memory that doesn't get
    // deallocated when the program is shutting down. A memory leak is
    // intentional, because we might otherwise not be able to execute
    // system calls part way through the program shutting down.
    if (!traps_) {
      traps_ = new Traps();
    }
    if (traps_->size() >= SECCOMP_RET_DATA) {
      // In practice, this is pretty much impossible to trigger, as there
      // are other kernel limitations that restrict overall BPF program sizes.
      SANDBOX_DIE("Too many SECCOMP_RET_TRAP callback instances");
    }
    id = traps_->size() + 1;
    traps_->push_back(ErrorCode(fnc, aux, id));
    trapIds_[key] = id;

    // The signal handler needs to access the traps_ vector, but doing so is
    // not guaranteed to be async-signal safe. C++ does guarantee that the
    // contents of a vector are stored in a contiguous C-style array, though.
    // So, we look up the address and size of that array here, outside of the
    // signal handler, where it is safe to do so.
    trapArray_ = &(*traps_)[0];
    trapArraySize_ = id;
  } else {
    // We have seen this pair before. Hand out the id assigned earlier.
    id = known->second;
  }

  const ErrorCode err(fnc, aux, id);
  errMap_[err.err()] = err;
  return err;
}
// Trap handler registered by Kill(): terminates the process, using |aux|
// (a C string) as the error message. Never returns.
intptr_t Sandbox::bpfFailure(const struct arch_seccomp_data&, void *aux) {
  char *message = static_cast<char *>(aux);
  SANDBOX_DIE(message);
}
// Returns an ErrorCode that traps into bpfFailure(), which then dies with
// |msg| as the error message.
ErrorCode Sandbox::Kill(const char *msg) {
  char *aux = const_cast<char *>(msg);
  return Trap(bpfFailure, aux);
}
// Definitions of the static data members of Sandbox.

// Cached sandbox state; nothing is known until the first availability check.
Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN;
// Descriptor for /proc; a negative value means that none has been set.
int Sandbox::proc_fd_ = -1;
// Policies registered with setSandboxPolicy().
Sandbox::Evaluators Sandbox::evaluators_;
// ErrorCodes handed out by Trap(), keyed by their err() value.
Sandbox::ErrMap Sandbox::errMap_;
// Trap handlers, in id order; allocated on first use and never freed.
Sandbox::Traps *Sandbox::traps_ = NULL;
// Maps each (TrapFnc, aux) pair to the id assigned to it by Trap().
Sandbox::TrapIds Sandbox::trapIds_;
// Address and element count of the array backing *traps_, cached for use by
// the sigSys() signal handler.
ErrorCode *Sandbox::trapArray_ = NULL;
size_t Sandbox::trapArraySize_ = 0;
} // namespace