// blob: a50ddb38cd90381c2e7c20b724cb7d8a43e72a33 [file] [log] [blame]
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SANDBOX_LINUX_SECCOMP_BPF_SANDBOX_BPF_H__
#define SANDBOX_LINUX_SECCOMP_BPF_SANDBOX_BPF_H__
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/audit.h>
#include <linux/filter.h>
// #include <linux/seccomp.h>
#include <linux/unistd.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <sched.h>
#include <signal.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/ipc.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <sys/shm.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>
#include <algorithm>
#include <limits>
#include <map>
#include <utility>
#include <vector>
#ifndef SECCOMP_BPF_STANDALONE
#include "base/basictypes.h"
#include "base/eintr_wrapper.h"
#include "base/logging.h"
#endif
#if defined(SECCOMP_BPF_VALGRIND_HACKS)
#ifndef SECCOMP_BPF_STANDALONE
#include "base/third_party/valgrind/valgrind.h"
#endif
#endif
// The Seccomp2 kernel ABI is not part of older versions of glibc.
// As we can't break compilation with these versions of the library,
// we explicitly define all missing symbols. Each fallback is guarded, so a
// definition that the toolchain already provides always takes precedence.

// prctl() options. Both are keyed off PR_SET_NO_NEW_PRIVS; this assumes that
// a header providing one of them also provides the other.
#ifndef PR_SET_NO_NEW_PRIVS
#define PR_SET_NO_NEW_PRIVS 38
#define PR_GET_NO_NEW_PRIVS 39
#endif

// Fallback for IPC_64 when the system headers do not provide it.
#ifndef IPC_64
#define IPC_64 0x0100
#endif

// Seccomp modes and BPF filter return values, for headers that predate
// seccomp-bpf.
#ifndef SECCOMP_MODE_FILTER
#define SECCOMP_MODE_DISABLED 0
#define SECCOMP_MODE_STRICT 1
#define SECCOMP_MODE_FILTER 2 // Uses user-supplied filter
#define SECCOMP_RET_KILL 0x00000000U // Kill the task immediately
#define SECCOMP_RET_TRAP 0x00030000U // Disallow and force a SIGSYS
#define SECCOMP_RET_ERRNO 0x00050000U // Returns an errno
#define SECCOMP_RET_TRACE 0x7ff00000U // Pass to a tracer or disallow
#define SECCOMP_RET_ALLOW 0x7fff0000U // Allow
#define SECCOMP_RET_ACTION 0xffff0000U // Masks for the return value
#define SECCOMP_RET_DATA 0x0000ffffU // sections
#endif

// SECCOMP_RET_INVALID is our own sentinel rather than a kernel-provided
// symbol, so it must not depend on the SECCOMP_MODE_FILTER guard above:
// otherwise it would silently go missing as soon as <linux/seccomp.h> (or
// anything that includes it) has been seen before this header.
#ifndef SECCOMP_RET_INVALID
#define SECCOMP_RET_INVALID 0x8f8f8f8fU // Illegal return value
#endif

// The errno value used for denying system calls.
#define SECCOMP_DENY_ERRNO EPERM

// Fallback for SYS_SECCOMP when the system headers do not provide it.
#ifndef SYS_SECCOMP
#define SYS_SECCOMP 1
#endif
// Upper bound on the size of a BPF program that we are willing to handle.
// The kernel's own limit is in all likelihood much lower. Capping the size
// at 1 << 30 keeps every valid size representable in a 32-bit signed
// integer, which relaxes what we need from some of our data types.
#define SECCOMP_MAX_PROGRAM_SIZE (1 << 30)
// Per-architecture configuration. Every supported target defines:
//   MIN_SYSCALL, MAX_PUBLIC_SYSCALL, MAX_SYSCALL
//       bounds of the system call number ranges,
//   SECCOMP_ARCH
//       the AUDIT_ARCH_* constant of the target,
//   SECCOMP_REG(ctx, reg)
//       accessor for one register stored in (ctx)->uc_mcontext,
//   SECCOMP_RESULT, SECCOMP_SYSCALL, SECCOMP_IP, SECCOMP_PARM1..6
//       SECCOMP_REG() applied to the register that holds the respective
//       value on this target.
#if defined(__i386__)

# define MIN_SYSCALL             0u
# define MAX_PUBLIC_SYSCALL      1024u
# define MAX_SYSCALL             MAX_PUBLIC_SYSCALL

# define SECCOMP_ARCH            AUDIT_ARCH_I386

# define SECCOMP_REG(_ctx, _reg) ((_ctx)->uc_mcontext.gregs[(_reg)])
# define SECCOMP_RESULT(_ctx)    SECCOMP_REG(_ctx, REG_EAX)
# define SECCOMP_SYSCALL(_ctx)   SECCOMP_REG(_ctx, REG_EAX)
# define SECCOMP_IP(_ctx)        SECCOMP_REG(_ctx, REG_EIP)
# define SECCOMP_PARM1(_ctx)     SECCOMP_REG(_ctx, REG_EBX)
# define SECCOMP_PARM2(_ctx)     SECCOMP_REG(_ctx, REG_ECX)
# define SECCOMP_PARM3(_ctx)     SECCOMP_REG(_ctx, REG_EDX)
# define SECCOMP_PARM4(_ctx)     SECCOMP_REG(_ctx, REG_ESI)
# define SECCOMP_PARM5(_ctx)     SECCOMP_REG(_ctx, REG_EDI)
# define SECCOMP_PARM6(_ctx)     SECCOMP_REG(_ctx, REG_EBP)

#elif defined(__x86_64__)

# define MIN_SYSCALL             0u
# define MAX_PUBLIC_SYSCALL      1024u
# define MAX_SYSCALL             MAX_PUBLIC_SYSCALL

# define SECCOMP_ARCH            AUDIT_ARCH_X86_64

# define SECCOMP_REG(_ctx, _reg) ((_ctx)->uc_mcontext.gregs[(_reg)])
# define SECCOMP_RESULT(_ctx)    SECCOMP_REG(_ctx, REG_RAX)
# define SECCOMP_SYSCALL(_ctx)   SECCOMP_REG(_ctx, REG_RAX)
# define SECCOMP_IP(_ctx)        SECCOMP_REG(_ctx, REG_RIP)
# define SECCOMP_PARM1(_ctx)     SECCOMP_REG(_ctx, REG_RDI)
# define SECCOMP_PARM2(_ctx)     SECCOMP_REG(_ctx, REG_RSI)
# define SECCOMP_PARM3(_ctx)     SECCOMP_REG(_ctx, REG_RDX)
# define SECCOMP_PARM4(_ctx)     SECCOMP_REG(_ctx, REG_R10)
# define SECCOMP_PARM5(_ctx)     SECCOMP_REG(_ctx, REG_R8)
# define SECCOMP_PARM6(_ctx)     SECCOMP_REG(_ctx, REG_R9)

#elif defined(__arm__) && (defined(__thumb__) || defined(__ARM_EABI__))

// On top of the public system calls, ARM EABI has "ARM private" system
// calls that begin at |__ARM_NR_BASE|, plus cmpxchg, a "ghost syscall
// private to the kernel", located at |__ARM_NR_BASE+0x00fff0|.
// See </arch/arm/include/asm/unistd.h> in the Linux kernel.
# define MIN_SYSCALL             ((unsigned int)__NR_SYSCALL_BASE)
# define MAX_PUBLIC_SYSCALL      (MIN_SYSCALL + 1024u)
# define MIN_PRIVATE_SYSCALL     ((unsigned int)__ARM_NR_BASE)
# define MAX_PRIVATE_SYSCALL     (MIN_PRIVATE_SYSCALL + 16u)
# define MIN_GHOST_SYSCALL       ((unsigned int)__ARM_NR_BASE + 0xfff0u)
# define MAX_SYSCALL             (MIN_GHOST_SYSCALL + 4u)

// EM_ARM is not defined by <linux/elf-em.h> (which is what <linux/audit.h>
// includes), and <linux/elf.h> only includes <asm/elf.h> when building the
// kernel itself. Supply the value ourselves if it is missing.
# ifndef EM_ARM
#  define EM_ARM 40
# endif
# define SECCOMP_ARCH            AUDIT_ARCH_ARM

// The ARM sigcontext_t differs from the i386/x86_64 one: registers are
// individual members named arm_<reg> instead of entries of an array.
// See </arch/arm/include/asm/sigcontext.h> in the Linux kernel.
# define SECCOMP_REG(_ctx, _reg) ((_ctx)->uc_mcontext.arm_##_reg)

// Register assignment of the ARM EABI syscall convention.
# define SECCOMP_RESULT(_ctx)    SECCOMP_REG(_ctx, r0)
# define SECCOMP_SYSCALL(_ctx)   SECCOMP_REG(_ctx, r7)
# define SECCOMP_IP(_ctx)        SECCOMP_REG(_ctx, pc)
# define SECCOMP_PARM1(_ctx)     SECCOMP_REG(_ctx, r0)
# define SECCOMP_PARM2(_ctx)     SECCOMP_REG(_ctx, r1)
# define SECCOMP_PARM3(_ctx)     SECCOMP_REG(_ctx, r2)
# define SECCOMP_PARM4(_ctx)     SECCOMP_REG(_ctx, r3)
# define SECCOMP_PARM5(_ctx)     SECCOMP_REG(_ctx, r4)
# define SECCOMP_PARM6(_ctx)     SECCOMP_REG(_ctx, r5)

#else
#error Unsupported target platform
#endif
// A stand-alone build does not include the "base/..." headers, so it needs
// local stand-ins for the helper macros that this code relies on.
#ifdef SECCOMP_BPF_STANDALONE
// Number of elements in the array |x|.
#define arraysize(x) (sizeof(x) / sizeof(*(x)))
#define HANDLE_EINTR TEMP_FAILURE_RETRY
// Declares the default constructor, copy constructor and assignment operator
// of |TypeName|. No definitions are provided for them here.
#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \
  TypeName();                                    \
  TypeName(const TypeName&);                     \
  void operator=(const TypeName&)
#endif
#include "sandbox/linux/seccomp-bpf/die.h"
#include "sandbox/linux/seccomp-bpf/errorcode.h"
namespace playground2 {
// Describes one system call; this is what a trap handler receives as its
// first argument.
// NOTE(review): field order and widths look like the kernel's
// "struct seccomp_data" -- presumably the layout must stay as it is; confirm
// before changing anything here.
struct arch_seccomp_data {
  int      nr;                    // System call number
  uint32_t arch;                  // Presumably an AUDIT_ARCH_* value
  uint64_t instruction_pointer;
  uint64_t args[6];               // Room for six system call arguments
};
// Address, system call number and architecture belonging to one trapped
// system call.
// NOTE(review): presumably overlays the SIGSYS-specific part of siginfo_t;
// confirm against the signal handler before changing the layout.
struct arch_sigsys {
  void         *ip;
  int          nr;
  unsigned int arch;
};
// Forward declarations. Within this header, CodeGen and Instruction are only
// used through pointers (see Sandbox::assembleJumpTable()), and CodeGen and
// SandboxUnittestHelper are named as friends of Sandbox, so their full
// definitions are not needed here.
class CodeGen;
class SandboxUnittestHelper;
struct Instruction;
// Interface of the seccomp-BPF sandbox. Every member is static; the class
// merely serves as a scope and is not meant to be instantiated.
class Sandbox {
 public:
  enum SandboxStatus {
    STATUS_UNKNOWN,      // supportsSeccompSandbox() has not been called yet
    STATUS_UNSUPPORTED,  // The kernel does not seem to support sandboxing
    STATUS_UNAVAILABLE,  // Unavailable right now, but might work again later
    STATUS_AVAILABLE,    // Sandboxing could be used, but is not active yet
    STATUS_ENABLED       // The sandbox has been activated
  };

  // Signature of a user-space handler for Seccomp traps. A seccomp policy
  // asks for such a handler to be installed by having its syscallEvaluator
  // return a suitable ErrorCode(); the ErrorCode() constructor documents how
  // the function pointer is passed in.
  // A TrapFnc runs in signal context and therefore has to be
  // async-signal safe:
  // http://pubs.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_04.html
  typedef intptr_t (*TrapFnc)(const struct arch_seccomp_data& args, void *aux);

  enum Operation {
    OP_NOP,
    OP_EQUAL,
    OP_NOTEQUAL,
    OP_LESS,
    OP_LESS_EQUAL,
    OP_GREATER,
    OP_GREATER_EQUAL,
    OP_HAS_BITS,
    OP_DOES_NOT_HAVE_BITS
  };

  // NOTE(review): presumably describes one test (|op| against |value|) and
  // the ErrorCode to use when it passes or fails; nothing else in this header
  // refers to it, so confirm against the implementation.
  struct Constraint {
    bool      is32bit;
    Operation op;
    uint32_t  value;
    ErrorCode passed;
    ErrorCode failed;
  };

  // Signature of a sandbox policy function. setSandboxPolicy() accepts an
  // arbitrary pointer from its caller, and that pointer is handed back to
  // the policy as "aux" whenever it is invoked through an EvaluateSyscall
  // function pointer. A typical use is to forward "aux" to Trap() functions.
  typedef ErrorCode (*EvaluateSyscall)(int sysnum, void *aux);
  typedef std::vector<std::pair<EvaluateSyscall, void *> > Evaluators;

  // Tells whether |sysnum| is a valid system call number on the current
  // architecture. ARM, for instance, has a non-contiguous range of private
  // system calls.
  static bool isValidSyscallNumber(int sysnum);

  // Reports whether the Seccomp sandbox can be used. It can be unavailable
  // for many reasons, e.g. because the kernel lacks support for Seccomp mode
  // or because another sandbox is active already.
  // "proc_fd" is expected to be a file descriptor for "/proc"; callers that
  // do not have one pass -1.
  static SandboxStatus supportsSeccompSandbox(int proc_fd);

  // The sandbox has to access files in "/proc/self". Callers for whom this
  // directory will not be accessible by the time "startSandbox()" runs can
  // hand over an already opened file descriptor through "setProcFd()".
  // Ownership of the descriptor passes to the sandbox, which eventually
  // closes it when "startSandbox()" executes.
  static void setProcFd(int proc_fd);

  // Installs the system call evaluator, which gets called with the system
  // call number and has three options:
  //  - allow the system call unconditionally by returning ERR_ALLOWED,
  //  - deny it unconditionally by returning an appropriate "errno" value, or
  //  - request inspection of the system call argument(s) by returning a
  //    suitable ErrorCode.
  // "aux" carries optional data for the evaluator. It can be used in various
  // ways; one of them is for the policy to forward the pointer to a Trap()
  // handler. The pointed-to data then has to stay valid for as long as
  // Trap() handlers can be called, which typically means for the lifetime of
  // the program.
  static void setSandboxPolicy(EvaluateSyscall syscallEvaluator, void *aux);

  // Wraps the callback |fnc| into an ErrorCode object; returning that
  // ErrorCode is how a policy requests that a trap handler be called.
  // "aux" can point to arbitrary data. EvaluateSyscall describes how data
  // travels from setSandboxPolicy() to a Trap() handler.
  static ErrorCode Trap(ErrorCode::TrapFnc fnc, const void *aux);

  // Kill the program and print an error message.
  static ErrorCode Kill(const char *msg);

  // The main public entry point: finds all system calls that need
  // rewriting, sets up the resources that the sandbox needs, and enters
  // Seccomp mode.
  static void startSandbox() {
    startSandboxInternal(false);
  }

 private:
  friend class ErrorCode;
  friend class CodeGen;
  friend class SandboxUnittestHelper;
  friend class Util;
  friend class Verifier;

  typedef std::vector<struct sock_filter> Program;

  // A pair of bounds (|from|, |to|) together with the ErrorCode that is
  // associated with them.
  struct Range {
    Range(uint32_t range_from, uint32_t range_to, const ErrorCode& code)
        : from(range_from),
          to(range_to),
          err(code) {
    }

    uint32_t  from;
    uint32_t  to;
    ErrorCode err;
  };

  typedef std::vector<Range> Ranges;
  typedef std::map<uint32_t, ErrorCode> ErrMap;
  typedef std::vector<ErrorCode> Traps;
  typedef std::map<std::pair<TrapFnc, const void *>, int> TrapIds;

  // Returns the stored file descriptor for "/proc", if one is currently
  // available.
  static int proc_fd() {
    return proc_fd_;
  }

  // Internal helpers. Only the declarations live in this header.
  static ErrorCode probeEvaluator(int sysnum, void *) __attribute__((const));
  static void probeProcess();
  static ErrorCode allowAllEvaluator(int sysnum, void *aux);
  static void tryVsyscallProcess();
  static bool kernelSupportSeccompBPF(int proc_fd);
  static bool RunFunctionInPolicy(void (*function)(),
                                  EvaluateSyscall syscallEvaluator,
                                  void *aux,
                                  int proc_fd);
  static void startSandboxInternal(bool quiet);
  static bool isSingleThreaded(int proc_fd);
  static bool isDenied(const ErrorCode& code);
  static bool disableFilesystem();
  static void policySanityChecks(EvaluateSyscall syscallEvaluator,
                                 void *aux);
  static void installFilter(bool quiet);
  static void findRanges(Ranges *ranges);
  static Instruction *assembleJumpTable(CodeGen *gen,
                                        Ranges::const_iterator start,
                                        Ranges::const_iterator stop);
  // The parameter list has the shape of an SA_SIGINFO-style signal handler.
  static void sigSys(int nr, siginfo_t *info, void *void_context);
  static intptr_t bpfFailure(const struct arch_seccomp_data& data, void *aux);
  static int getTrapId(TrapFnc fnc, const void *aux);

  // Process-wide state of the sandbox.
  static SandboxStatus status_;
  static int           proc_fd_;
  static Evaluators    evaluators_;
  static ErrMap        errMap_;
  static Traps         *traps_;
  static TrapIds       trapIds_;
  static ErrorCode     *trapArray_;
  static size_t        trapArraySize_;

  DISALLOW_IMPLICIT_CONSTRUCTORS(Sandbox);
};
} // namespace playground2
#endif // SANDBOX_LINUX_SECCOMP_BPF_SANDBOX_BPF_H__