| // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include <time.h> |
| |
| #include "sandbox/linux/seccomp-bpf/sandbox_bpf.h" |
| #include "sandbox/linux/seccomp-bpf/verifier.h" |
| |
| // The kernel gives us a sandbox, we turn it into a playground :-) |
| // This is version 2 of the playground; version 1 was built on top of |
| // pre-BPF seccomp mode. |
| namespace playground2 { |
| |
| // We define a really simple sandbox policy. It is just good enough for us |
| // to tell that the sandbox has actually been activated. |
| ErrorCode Sandbox::probeEvaluator(int signo) { |
| switch (signo) { |
| case __NR_getpid: |
| // Return EPERM so that we can check that the filter actually ran. |
| return ErrorCode(EPERM); |
| case __NR_exit_group: |
| // Allow exit() with a non-default return code. |
| return ErrorCode(ErrorCode::ERR_ALLOWED); |
| default: |
| // Make everything else fail in an easily recognizable way. |
| return ErrorCode(EINVAL); |
| } |
| } |
| |
| void Sandbox::probeProcess(void) { |
| if (syscall(__NR_getpid) < 0 && errno == EPERM) { |
| syscall(__NR_exit_group, (intptr_t)100); |
| } |
| } |
| |
| ErrorCode Sandbox::allowAllEvaluator(int signo) { |
| if (signo < static_cast<int>(MIN_SYSCALL) || |
| signo > static_cast<int>(MAX_SYSCALL)) { |
| return ErrorCode(ENOSYS); |
| } |
| return ErrorCode(ErrorCode::ERR_ALLOWED); |
| } |
| |
| void Sandbox::tryVsyscallProcess(void) { |
| time_t current_time; |
| // time() is implemented as a vsyscall. With an older glibc, with |
| // vsyscall=emulate and some versions of the seccomp BPF patch |
| // we may get SIGKILL-ed. Detect this! |
| if (time(¤t_time) != static_cast<time_t>(-1)) { |
| syscall(__NR_exit_group, (intptr_t)100); |
| } |
| } |
| |
| bool Sandbox::RunFunctionInPolicy(void (*CodeInSandbox)(), |
| EvaluateSyscall syscallEvaluator, |
| int proc_fd) { |
| // Block all signals before forking a child process. This prevents an |
| // attacker from manipulating our test by sending us an unexpected signal. |
| sigset_t oldMask, newMask; |
| if (sigfillset(&newMask) || |
| sigprocmask(SIG_BLOCK, &newMask, &oldMask)) { |
| SANDBOX_DIE("sigprocmask() failed"); |
| } |
| int fds[2]; |
| if (pipe2(fds, O_NONBLOCK|O_CLOEXEC)) { |
| SANDBOX_DIE("pipe() failed"); |
| } |
| |
| pid_t pid = fork(); |
| if (pid < 0) { |
| // Die if we cannot fork(). We would probably fail a little later |
| // anyway, as the machine is likely very close to running out of |
| // memory. |
| // But what we don't want to do is return "false", as a crafty |
| // attacker might cause fork() to fail at will and could trick us |
| // into running without a sandbox. |
| sigprocmask(SIG_SETMASK, &oldMask, NULL); // OK, if it fails |
| SANDBOX_DIE("fork() failed unexpectedly"); |
| } |
| |
| // In the child process |
| if (!pid) { |
| // Test a very simple sandbox policy to verify that we can |
| // successfully turn on sandboxing. |
| Die::EnableSimpleExit(); |
| if (HANDLE_EINTR(close(fds[0])) || |
| dup2(fds[1], 2) != 2 || |
| HANDLE_EINTR(close(fds[1]))) { |
| static const char msg[] = "Failed to set up stderr\n"; |
| if (HANDLE_EINTR(write(fds[1], msg, sizeof(msg)-1))) { } |
| } else { |
| evaluators_.clear(); |
| setSandboxPolicy(syscallEvaluator, NULL); |
| setProcFd(proc_fd); |
| |
| // By passing "quiet=true" to "startSandboxInternal()" we suppress |
| // messages for expected and benign failures (e.g. if the current |
| // kernel lacks support for BPF filters). |
| startSandboxInternal(true); |
| |
| // Run our code in the sandbox |
| CodeInSandbox(); |
| } |
| SANDBOX_DIE(NULL); |
| } |
| |
| // In the parent process. |
| if (HANDLE_EINTR(close(fds[1]))) { |
| SANDBOX_DIE("close() failed"); |
| } |
| if (sigprocmask(SIG_SETMASK, &oldMask, NULL)) { |
| SANDBOX_DIE("sigprocmask() failed"); |
| } |
| int status; |
| if (HANDLE_EINTR(waitpid(pid, &status, 0)) != pid) { |
| SANDBOX_DIE("waitpid() failed unexpectedly"); |
| } |
| bool rc = WIFEXITED(status) && WEXITSTATUS(status) == 100; |
| |
| // If we fail to support sandboxing, there might be an additional |
| // error message. If so, this was an entirely unexpected and fatal |
| // failure. We should report the failure and somebody must fix |
| // things. This is probably a security-critical bug in the sandboxing |
| // code. |
| if (!rc) { |
| char buf[4096]; |
| ssize_t len = HANDLE_EINTR(read(fds[0], buf, sizeof(buf) - 1)); |
| if (len > 0) { |
| while (len > 1 && buf[len-1] == '\n') { |
| --len; |
| } |
| buf[len] = '\000'; |
| SANDBOX_DIE(buf); |
| } |
| } |
| if (HANDLE_EINTR(close(fds[0]))) { |
| SANDBOX_DIE("close() failed"); |
| } |
| |
| return rc; |
| |
| } |
| |
| bool Sandbox::kernelSupportSeccompBPF(int proc_fd) { |
| #if defined(SECCOMP_BPF_VALGRIND_HACKS) |
| if (RUNNING_ON_VALGRIND) { |
| // Valgrind doesn't like our run-time test. Disable testing and assume we |
| // always support sandboxing. This feature should only ever be enabled when |
| // debugging. |
| return true; |
| } |
| #endif |
| |
| return RunFunctionInPolicy(probeProcess, Sandbox::probeEvaluator, proc_fd) && |
| RunFunctionInPolicy(tryVsyscallProcess, Sandbox::allowAllEvaluator, |
| proc_fd); |
| } |
| |
| Sandbox::SandboxStatus Sandbox::supportsSeccompSandbox(int proc_fd) { |
| // It the sandbox is currently active, we clearly must have support for |
| // sandboxing. |
| if (status_ == STATUS_ENABLED) { |
| return status_; |
| } |
| |
| // Even if the sandbox was previously available, something might have |
| // changed in our run-time environment. Check one more time. |
| if (status_ == STATUS_AVAILABLE) { |
| if (!isSingleThreaded(proc_fd)) { |
| status_ = STATUS_UNAVAILABLE; |
| } |
| return status_; |
| } |
| |
| if (status_ == STATUS_UNAVAILABLE && isSingleThreaded(proc_fd)) { |
| // All state transitions resulting in STATUS_UNAVAILABLE are immediately |
| // preceded by STATUS_AVAILABLE. Furthermore, these transitions all |
| // happen, if and only if they are triggered by the process being multi- |
| // threaded. |
| // In other words, if a single-threaded process is currently in the |
| // STATUS_UNAVAILABLE state, it is safe to assume that sandboxing is |
| // actually available. |
| status_ = STATUS_AVAILABLE; |
| return status_; |
| } |
| |
| // If we have not previously checked for availability of the sandbox or if |
| // we otherwise don't believe to have a good cached value, we have to |
| // perform a thorough check now. |
| if (status_ == STATUS_UNKNOWN) { |
| status_ = kernelSupportSeccompBPF(proc_fd) |
| ? STATUS_AVAILABLE : STATUS_UNSUPPORTED; |
| |
| // As we are performing our tests from a child process, the run-time |
| // environment that is visible to the sandbox is always guaranteed to be |
| // single-threaded. Let's check here whether the caller is single- |
| // threaded. Otherwise, we mark the sandbox as temporarily unavailable. |
| if (status_ == STATUS_AVAILABLE && !isSingleThreaded(proc_fd)) { |
| status_ = STATUS_UNAVAILABLE; |
| } |
| } |
| return status_; |
| } |
| |
| void Sandbox::setProcFd(int proc_fd) { |
| proc_fd_ = proc_fd; |
| } |
| |
| void Sandbox::startSandboxInternal(bool quiet) { |
| if (status_ == STATUS_UNSUPPORTED || status_ == STATUS_UNAVAILABLE) { |
| SANDBOX_DIE("Trying to start sandbox, even though it is known to be " |
| "unavailable"); |
| } else if (status_ == STATUS_ENABLED) { |
| SANDBOX_DIE("Cannot start sandbox recursively. Use multiple calls to " |
| "setSandboxPolicy() to stack policies instead"); |
| } |
| if (proc_fd_ < 0) { |
| proc_fd_ = open("/proc", O_RDONLY|O_DIRECTORY); |
| } |
| if (proc_fd_ < 0) { |
| // For now, continue in degraded mode, if we can't access /proc. |
| // In the future, we might want to tighten this requirement. |
| } |
| if (!isSingleThreaded(proc_fd_)) { |
| SANDBOX_DIE("Cannot start sandbox, if process is already multi-threaded"); |
| } |
| |
| // We no longer need access to any files in /proc. We want to do this |
| // before installing the filters, just in case that our policy denies |
| // close(). |
| if (proc_fd_ >= 0) { |
| if (HANDLE_EINTR(close(proc_fd_))) { |
| SANDBOX_DIE("Failed to close file descriptor for /proc"); |
| } |
| proc_fd_ = -1; |
| } |
| |
| // Install the filters. |
| installFilter(quiet); |
| |
| // We are now inside the sandbox. |
| status_ = STATUS_ENABLED; |
| } |
| |
| bool Sandbox::isSingleThreaded(int proc_fd) { |
| if (proc_fd < 0) { |
| // Cannot determine whether program is single-threaded. Hope for |
| // the best... |
| return true; |
| } |
| |
| struct stat sb; |
| int task = -1; |
| if ((task = openat(proc_fd, "self/task", O_RDONLY|O_DIRECTORY)) < 0 || |
| fstat(task, &sb) != 0 || |
| sb.st_nlink != 3 || |
| HANDLE_EINTR(close(task))) { |
| if (task >= 0) { |
| if (HANDLE_EINTR(close(task))) { } |
| } |
| return false; |
| } |
| return true; |
| } |
| |
| bool Sandbox::isDenied(const ErrorCode& code) { |
| return (code.err() & SECCOMP_RET_ACTION) == SECCOMP_RET_TRAP || |
| (code.err() >= (SECCOMP_RET_ERRNO + ErrorCode::ERR_MIN_ERRNO) && |
| code.err() <= (SECCOMP_RET_ERRNO + ErrorCode::ERR_MAX_ERRNO)); |
| } |
| |
| void Sandbox::policySanityChecks(EvaluateSyscall syscallEvaluator, |
| EvaluateArguments) { |
| // Do some sanity checks on the policy. This will warn users if they do |
| // things that are likely unsafe and unintended. |
| // We also have similar checks later, when we actually compile the BPF |
| // program. That catches problems with incorrectly stacked evaluators. |
| if (!isDenied(syscallEvaluator(-1))) { |
| SANDBOX_DIE("Negative system calls should always be disallowed by policy"); |
| } |
| #ifndef NDEBUG |
| #if defined(__i386__) || defined(__x86_64__) |
| #if defined(__x86_64__) && defined(__ILP32__) |
| for (unsigned int sysnum = MIN_SYSCALL & ~0x40000000u; |
| sysnum <= (MAX_SYSCALL & ~0x40000000u); |
| ++sysnum) { |
| if (!isDenied(syscallEvaluator(sysnum))) { |
| SANDBOX_DIE("In x32 mode, you should not allow any non-x32 " |
| "system calls"); |
| } |
| } |
| #else |
| for (unsigned int sysnum = MIN_SYSCALL | 0x40000000u; |
| sysnum <= (MAX_SYSCALL | 0x40000000u); |
| ++sysnum) { |
| if (!isDenied(syscallEvaluator(sysnum))) { |
| SANDBOX_DIE("x32 system calls should be explicitly disallowed"); |
| } |
| } |
| #endif |
| #endif |
| #endif |
| // Check interesting boundary values just outside of the valid system call |
| // range: 0x7FFFFFFF, 0x80000000, 0xFFFFFFFF, MIN_SYSCALL-1, MAX_SYSCALL+1. |
| // They all should be denied. |
| if (!isDenied(syscallEvaluator(std::numeric_limits<int>::max())) || |
| !isDenied(syscallEvaluator(std::numeric_limits<int>::min())) || |
| !isDenied(syscallEvaluator(-1)) || |
| !isDenied(syscallEvaluator(static_cast<int>(MIN_SYSCALL) - 1)) || |
| !isDenied(syscallEvaluator(static_cast<int>(MAX_SYSCALL) + 1))) { |
| SANDBOX_DIE("Even for default-allow policies, you must never allow system " |
| "calls outside of the standard system call range"); |
| } |
| return; |
| } |
| |
| void Sandbox::setSandboxPolicy(EvaluateSyscall syscallEvaluator, |
| EvaluateArguments argumentEvaluator) { |
| if (status_ == STATUS_ENABLED) { |
| SANDBOX_DIE("Cannot change policy after sandbox has started"); |
| } |
| policySanityChecks(syscallEvaluator, argumentEvaluator); |
| evaluators_.push_back(std::make_pair(syscallEvaluator, argumentEvaluator)); |
| } |
| |
| void Sandbox::installFilter(bool quiet) { |
| // Verify that the user pushed a policy. |
| if (evaluators_.empty()) { |
| filter_failed: |
| SANDBOX_DIE("Failed to configure system call filters"); |
| } |
| |
| // Set new SIGSYS handler |
| struct sigaction sa; |
| memset(&sa, 0, sizeof(sa)); |
| sa.sa_sigaction = &sigSys; |
| sa.sa_flags = SA_SIGINFO; |
| if (sigaction(SIGSYS, &sa, NULL) < 0) { |
| goto filter_failed; |
| } |
| |
| // Unmask SIGSYS |
| sigset_t mask; |
| if (sigemptyset(&mask) || |
| sigaddset(&mask, SIGSYS) || |
| sigprocmask(SIG_UNBLOCK, &mask, NULL)) { |
| goto filter_failed; |
| } |
| |
| // We can't handle stacked evaluators, yet. We'll get there eventually |
| // though. Hang tight. |
| if (evaluators_.size() != 1) { |
| SANDBOX_DIE("Not implemented"); |
| } |
| |
| // Assemble the BPF filter program. |
| Program *program = new Program(); |
| if (!program) { |
| SANDBOX_DIE("Out of memory"); |
| } |
| |
| // If the architecture doesn't match SECCOMP_ARCH, disallow the |
| // system call. |
| program->push_back((struct sock_filter) |
| BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct arch_seccomp_data, arch))); |
| program->push_back((struct sock_filter) |
| BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH, 1, 0)); |
| |
| program->push_back((struct sock_filter) |
| BPF_STMT(BPF_RET+BPF_K, |
| Kill("Invalid audit architecture in BPF filter").err())); |
| |
| // Grab the system call number, so that we can implement jump tables. |
| program->push_back((struct sock_filter) |
| BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct arch_seccomp_data, nr))); |
| |
| // On Intel architectures, verify that system call numbers are in the |
| // expected number range. The older i386 and x86-64 APIs clear bit 30 |
| // on all system calls. The newer x86-32 API always sets bit 30. |
| #if defined(__i386__) || defined(__x86_64__) |
| #if defined(__x86_64__) && defined(__ILP32__) |
| program->push_back((struct sock_filter) |
| BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, 1, 0)); |
| #else |
| program->push_back((struct sock_filter) |
| BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, 0, 1)); |
| #endif |
| program->push_back((struct sock_filter) |
| BPF_STMT(BPF_RET+BPF_K, |
| Kill("Illegal mixing of system call ABIs").err())); |
| #endif |
| |
| |
| { |
| // Evaluate all possible system calls and group their ErrorCodes into |
| // ranges of identical codes. |
| Ranges ranges; |
| findRanges(&ranges); |
| |
| // Compile the system call ranges to an optimized BPF program |
| RetInsns rets; |
| emitJumpStatements(program, &rets, ranges.begin(), ranges.end()); |
| emitReturnStatements(program, rets); |
| } |
| |
| // Make sure compilation resulted in BPF program that executes |
| // correctly. Otherwise, there is an internal error in our BPF compiler. |
| // There is really nothing the caller can do until the bug is fixed. |
| #ifndef NDEBUG |
| const char *err = NULL; |
| if (!Verifier::verifyBPF(*program, evaluators_, &err)) { |
| SANDBOX_DIE(err); |
| } |
| #endif |
| |
| // We want to be very careful in not imposing any requirements on the |
| // policies that are set with setSandboxPolicy(). This means, as soon as |
| // the sandbox is active, we shouldn't be relying on libraries that could |
| // be making system calls. This, for example, means we should avoid |
| // using the heap and we should avoid using STL functions. |
| // Temporarily copy the contents of the "program" vector into a |
| // stack-allocated array; and then explicitly destroy that object. |
| // This makes sure we don't ex- or implicitly call new/delete after we |
| // installed the BPF filter program in the kernel. Depending on the |
| // system memory allocator that is in effect, these operators can result |
| // in system calls to things like munmap() or brk(). |
| struct sock_filter bpf[program->size()]; |
| const struct sock_fprog prog = { |
| static_cast<unsigned short>(program->size()), bpf }; |
| memcpy(bpf, &(*program)[0], sizeof(bpf)); |
| delete program; |
| |
| // Release memory that is no longer needed |
| evaluators_.clear(); |
| errMap_.clear(); |
| |
| #if defined(SECCOMP_BPF_VALGRIND_HACKS) |
| // Valgrind is really not happy about our sandbox. Disable it when running |
| // in Valgrind. This feature is dangerous and should never be enabled by |
| // default. We protect it behind a pre-processor option. |
| if (!RUNNING_ON_VALGRIND) |
| #endif |
| { |
| // Install BPF filter program |
| if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { |
| SANDBOX_DIE(quiet ? NULL : "Kernel refuses to enable no-new-privs"); |
| } else { |
| if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) { |
| SANDBOX_DIE(quiet ? NULL : "Kernel refuses to turn on BPF filters"); |
| } |
| } |
| } |
| |
| return; |
| } |
| |
| void Sandbox::findRanges(Ranges *ranges) { |
| // Please note that "struct seccomp_data" defines system calls as a signed |
| // int32_t, but BPF instructions always operate on unsigned quantities. We |
| // deal with this disparity by enumerating from MIN_SYSCALL to MAX_SYSCALL, |
| // and then verifying that the rest of the number range (both positive and |
| // negative) all return the same ErrorCode. |
| EvaluateSyscall evaluateSyscall = evaluators_.begin()->first; |
| uint32_t oldSysnum = 0; |
| ErrorCode oldErr = evaluateSyscall(oldSysnum); |
| for (uint32_t sysnum = std::max(1u, MIN_SYSCALL); |
| sysnum <= MAX_SYSCALL + 1; |
| ++sysnum) { |
| ErrorCode err = evaluateSyscall(static_cast<int>(sysnum)); |
| if (!err.Equals(oldErr)) { |
| ranges->push_back(Range(oldSysnum, sysnum-1, oldErr)); |
| oldSysnum = sysnum; |
| oldErr = err; |
| } |
| } |
| |
| // As we looped all the way past the valid system calls (i.e. MAX_SYSCALL+1), |
| // "oldErr" should at this point be the "default" policy for all system call |
| // numbers that don't have an explicit handler in the system call evaluator. |
| // But as we are quite paranoid, we perform some more sanity checks to verify |
| // that there actually is a consistent "default" policy in the first place. |
| // We don't actually iterate over all possible 2^32 values, though. We just |
| // perform spot checks at the boundaries. |
| // The cases that we test are: 0x7FFFFFFF, 0x80000000, 0xFFFFFFFF. |
| if (!oldErr.Equals(evaluateSyscall(std::numeric_limits<int>::max())) || |
| !oldErr.Equals(evaluateSyscall(std::numeric_limits<int>::min())) || |
| !oldErr.Equals(evaluateSyscall(-1))) { |
| SANDBOX_DIE("Invalid seccomp policy"); |
| } |
| ranges->push_back( |
| Range(oldSysnum, std::numeric_limits<unsigned>::max(), oldErr)); |
| } |
| |
| void Sandbox::emitJumpStatements(Program *program, RetInsns *rets, |
| Ranges::const_iterator start, |
| Ranges::const_iterator stop) { |
| // We convert the list of system call ranges into jump table that performs |
| // a binary search over the ranges. |
| // As a sanity check, we need to have at least two distinct ranges for us |
| // to be able to build a jump table. |
| if (stop - start <= 1) { |
| SANDBOX_DIE("Invalid set of system call ranges"); |
| } |
| |
| // Pick the range object that is located at the mid point of our list. |
| // We compare our system call number against the lowest valid system call |
| // number in this range object. If our number is lower, it is outside of |
| // this range object. If it is greater or equal, it might be inside. |
| Ranges::const_iterator mid = start + (stop - start)/2; |
| Program::size_type jmp = program->size(); |
| if (jmp >= SECCOMP_MAX_PROGRAM_SIZE) { |
| compiler_err: |
| SANDBOX_DIE("Internal compiler error; failed to compile jump table"); |
| } |
| program->push_back((struct sock_filter) |
| BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, mid->from, |
| // Jump targets are place-holders that will be fixed up later. |
| 0, 0)); |
| |
| // The comparison turned out to be false; i.e. our system call number is |
| // less than the range object at the mid point of the list. |
| if (mid - start == 1) { |
| // If we have narrowed things down to a single range object, we can |
| // return from the BPF filter program. |
| // Instead of emitting a BPF_RET statement, we want to coalesce all |
| // identical BPF_RET statements into a single instance. This results in |
| // a more efficient BPF program that uses less CPU cache. |
| // Since all branches in BPF programs have to be forward branches, we |
| // keep track of our current instruction pointer and then fix up the |
| // branch when we emit the BPF_RET statement in emitReturnStatements(). |
| (*rets)[start->err.err()].push_back(FixUp(jmp, false)); |
| } else { |
| // Sub-divide the list of ranges and continue recursively. |
| emitJumpStatements(program, rets, start, mid); |
| } |
| |
| // The comparison turned out to be true; i.e. our system call number is |
| // greater or equal to the range object at the mid point of the list. |
| if (stop - mid == 1) { |
| // We narrowed things down to a single range object. Remember instruction |
| // pointer and exit code, so that we can patch up the target of the jump |
| // instruction in emitReturnStatements(). |
| (*rets)[mid->err.err()].push_back(FixUp(jmp, true)); |
| } else { |
| // We now know where the block of instructions for the "true" comparison |
| // starts. Patch up the jump target of the BPF_JMP instruction that we |
| // emitted earlier. |
| int distance = program->size() - jmp - 1; |
| if (distance < 0 || distance > 255) { |
| goto compiler_err; |
| } |
| (*program)[jmp].jt = distance; |
| |
| // Sub-divide the list of ranges and continue recursively. |
| emitJumpStatements(program, rets, mid, stop); |
| } |
| } |
| |
| void Sandbox::emitReturnStatements(Program *program, const RetInsns& rets) { |
| // Iterate over the list of distinct exit codes from our BPF filter |
| // program and emit the BPF_RET statements. |
| for (RetInsns::const_iterator ret_iter = rets.begin(); |
| ret_iter != rets.end(); |
| ++ret_iter) { |
| Program::size_type ip = program->size(); |
| if (ip >= SECCOMP_MAX_PROGRAM_SIZE) { |
| SANDBOX_DIE("Internal compiler error; failed to compile jump table"); |
| } |
| program->push_back((struct sock_filter) |
| BPF_STMT(BPF_RET+BPF_K, ret_iter->first)); |
| |
| // Iterate over the instruction pointers for the BPF_JMP instructions |
| // that need to be patched up. |
| for (std::vector<FixUp>::const_iterator insn_iter=ret_iter->second.begin(); |
| insn_iter != ret_iter->second.end(); |
| ++insn_iter) { |
| // Jumps are always relative and they are always forward. |
| int distance = ip - insn_iter->addr - 1; |
| if (distance < 0 || distance > 255) { |
| SANDBOX_DIE("Internal compiler error; failed to compile jump table"); |
| } |
| |
| // Decide whether we need to patch up the "true" or the "false" jump |
| // target. |
| if (insn_iter->jt) { |
| (*program)[insn_iter->addr].jt = distance; |
| } else { |
| (*program)[insn_iter->addr].jf = distance; |
| } |
| } |
| } |
| } |
| |
| void Sandbox::sigSys(int nr, siginfo_t *info, void *void_context) { |
| // Various sanity checks to make sure we actually received a signal |
| // triggered by a BPF filter. If something else triggered SIGSYS |
| // (e.g. kill()), there is really nothing we can do with this signal. |
| if (nr != SIGSYS || info->si_code != SYS_SECCOMP || !void_context || |
| info->si_errno <= 0 || |
| static_cast<size_t>(info->si_errno) > trapArraySize_) { |
| // SANDBOX_DIE() can call LOG(FATAL). This is not normally async-signal |
| // safe and can lead to bugs. We should eventually implement a different |
| // logging and reporting mechanism that is safe to be called from |
| // the sigSys() handler. |
| // TODO: If we feel confident that our code otherwise works correctly, we |
| // could actually make an argument that spurious SIGSYS should |
| // just get silently ignored. TBD |
| sigsys_err: |
| SANDBOX_DIE("Unexpected SIGSYS received"); |
| } |
| |
| // Signal handlers should always preserve "errno". Otherwise, we could |
| // trigger really subtle bugs. |
| int old_errno = errno; |
| |
| // Obtain the signal context. This, most notably, gives us access to |
| // all CPU registers at the time of the signal. |
| ucontext_t *ctx = reinterpret_cast<ucontext_t *>(void_context); |
| |
| // Obtain the siginfo information that is specific to SIGSYS. Unfortunately, |
| // most versions of glibc don't include this information in siginfo_t. So, |
| // we need to explicitly copy it into a arch_sigsys structure. |
| struct arch_sigsys sigsys; |
| memcpy(&sigsys, &info->_sifields, sizeof(sigsys)); |
| |
| // Some more sanity checks. |
| if (sigsys.ip != reinterpret_cast<void *>(SECCOMP_IP(ctx)) || |
| sigsys.nr != static_cast<int>(SECCOMP_SYSCALL(ctx)) || |
| sigsys.arch != SECCOMP_ARCH) { |
| goto sigsys_err; |
| } |
| |
| // Copy the seccomp-specific data into a arch_seccomp_data structure. This |
| // is what we are showing to TrapFnc callbacks that the system call evaluator |
| // registered with the sandbox. |
| struct arch_seccomp_data data = { |
| sigsys.nr, |
| SECCOMP_ARCH, |
| reinterpret_cast<uint64_t>(sigsys.ip), |
| { |
| static_cast<uint64_t>(SECCOMP_PARM1(ctx)), |
| static_cast<uint64_t>(SECCOMP_PARM2(ctx)), |
| static_cast<uint64_t>(SECCOMP_PARM3(ctx)), |
| static_cast<uint64_t>(SECCOMP_PARM4(ctx)), |
| static_cast<uint64_t>(SECCOMP_PARM5(ctx)), |
| static_cast<uint64_t>(SECCOMP_PARM6(ctx)) |
| } |
| }; |
| |
| // Now call the TrapFnc callback associated with this particular instance |
| // of SECCOMP_RET_TRAP. |
| const ErrorCode& err = trapArray_[info->si_errno - 1]; |
| intptr_t rc = err.fnc_(data, err.aux_); |
| |
| // Update the CPU register that stores the return code of the system call |
| // that we just handled, and restore "errno" to the value that it had |
| // before entering the signal handler. |
| SECCOMP_RESULT(ctx) = static_cast<greg_t>(rc); |
| errno = old_errno; |
| |
| return; |
| } |
| |
| ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) { |
| // Each unique pair of TrapFnc and auxiliary data make up a distinct instance |
| // of a SECCOMP_RET_TRAP. |
| std::pair<ErrorCode::TrapFnc, const void *> key(fnc, aux); |
| TrapIds::const_iterator iter = trapIds_.find(key); |
| uint16_t id; |
| if (iter != trapIds_.end()) { |
| // We have seen this pair before. Return the same id that we assigned |
| // earlier. |
| id = iter->second; |
| } else { |
| // This is a new pair. Remember it and assign a new id. |
| // Please note that we have to store traps in memory that doesn't get |
| // deallocated when the program is shutting down. A memory leak is |
| // intentional, because we might otherwise not be able to execute |
| // system calls part way through the program shutting down |
| if (!traps_) { |
| traps_ = new Traps(); |
| } |
| if (traps_->size() >= SECCOMP_RET_DATA) { |
| // In practice, this is pretty much impossible to trigger, as there |
| // are other kernel limitations that restrict overall BPF program sizes. |
| SANDBOX_DIE("Too many SECCOMP_RET_TRAP callback instances"); |
| } |
| id = traps_->size() + 1; |
| |
| traps_->push_back(ErrorCode(fnc, aux, id)); |
| trapIds_[key] = id; |
| |
| // We want to access the traps_ vector from our signal handler. But |
| // we are not assured that doing so is async-signal safe. On the other |
| // hand, C++ guarantees that the contents of a vector is stored in a |
| // contiguous C-style array. |
| // So, we look up the address and size of this array outside of the |
| // signal handler, where we can safely do so. |
| trapArray_ = &(*traps_)[0]; |
| trapArraySize_ = id; |
| } |
| |
| ErrorCode err = ErrorCode(fnc, aux, id); |
| return errMap_[err.err()] = err; |
| } |
| |
| intptr_t Sandbox::bpfFailure(const struct arch_seccomp_data&, void *aux) { |
| SANDBOX_DIE(static_cast<char *>(aux)); |
| } |
| |
| ErrorCode Sandbox::Kill(const char *msg) { |
| return Trap(bpfFailure, const_cast<char *>(msg)); |
| } |
| |
| Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN; |
| int Sandbox::proc_fd_ = -1; |
| Sandbox::Evaluators Sandbox::evaluators_; |
| Sandbox::ErrMap Sandbox::errMap_; |
| Sandbox::Traps *Sandbox::traps_ = NULL; |
| Sandbox::TrapIds Sandbox::trapIds_; |
| ErrorCode *Sandbox::trapArray_ = NULL; |
| size_t Sandbox::trapArraySize_ = 0; |
| |
| } // namespace |