linux/services/credentials.cc - chromium/src/sandbox - Git at Google

 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "sandbox/linux/services/credentials.h"

 #include <errno.h>
 #include <limits.h>
 #include <sched.h>
 #include <signal.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <sys/syscall.h>
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <unistd.h>

 #include "base/bind.h"
 #include "base/compiler_specific.h"
 #include "base/cxx17_backports.h"
 #include "base/files/file_path.h"
 #include "base/files/file_util.h"
 #include "base/logging.h"
 #include "base/posix/eintr_wrapper.h"
 #include "base/process/launch.h"
 #include "build/build_config.h"
 #include "sandbox/linux/services/namespace_utils.h"
 #include "sandbox/linux/services/proc_util.h"
 #include "sandbox/linux/services/syscall_wrappers.h"
 #include "sandbox/linux/services/thread_helpers.h"
 #include "sandbox/linux/system_headers/capability.h"
 #include "sandbox/linux/system_headers/linux_signal.h"

 namespace sandbox {

 namespace {

 // Exit statuses used by the short-lived child processes created in this file.
 constexpr int kExitSuccess = 0;
 #if !defined(THREAD_SANITIZER)
 // Only referenced from code that is compiled out under ThreadSanitizer.
 constexpr int kExitFailure = 1;
 #endif

 // Child-side entry point handed to clone() by ChrootToSafeEmptyDir(). It
 // chroots to /proc/self/fdinfo/, changes directory to the new root and then
 // terminates the child with kExitSuccess via _exit(), so it does not return
 // normally. The void* argument is unused.
 #if defined(__clang__)
 // Disable sanitizers that rely on TLS and may write to non-stack memory.
 __attribute__((no_sanitize_address))
 __attribute__((no_sanitize_thread))
 __attribute__((no_sanitize_memory))
 #endif
 int ChrootToSelfFdinfo(void*) {
   // This function can be run from a vforked child, so it should not write to
   // any memory other than the stack or errno. Reads from TLS may be different
   // from in the parent process.
   RAW_CHECK(sys_chroot("/proc/self/fdinfo/") == 0);

   // CWD is essentially an implicit file descriptor, so be careful to not
   // leave it behind.
   RAW_CHECK(chdir("/") == 0);
   // Leave through _exit() instead of returning to the clone() caller.
   _exit(kExitSuccess);
 }

 // chroot() to an empty dir that is "safe". To be safe, it must not contain
 // any subdirectory (chroot-ing there would allow a chroot escape) and it must
 // be impossible to create an empty directory there.
 // We achieve this by doing the following:
 // 1. We create a new process sharing file system information.
 // 2. In the child, we chroot to /proc/self/fdinfo/
 // This is already "safe", since fdinfo/ does not contain another directory and
 // one cannot create another directory there.
 // 3. The process dies
 // After (3) happens, the directory is not available anymore in /proc.
 //
 // Returns true iff the child exited normally with status kExitSuccess. A
 // failing clone() or waitpid() is fatal (PCHECK).
 bool ChrootToSafeEmptyDir() {
   // We need to chroot to a fdinfo that is unique to a process and have that
   // process die.
   // 1. We don't want to simply fork() because duplicating the page tables is
   // slow with a big address space.
   // 2. We do not use a regular thread (that would unshare CLONE_FILES) because
   // when we are in a PID namespace, we cannot easily get a handle to the
   // /proc/tid directory for the thread (since /proc may not be aware of the
   // PID namespace). With a process, we can just use /proc/self.
   pid_t pid = -1;
   // Stack for the child; it lives in this frame, which stays alive until the
   // child has been reaped below.
   alignas(16) char stack_buf[PTHREAD_STACK_MIN];
 #if defined(ARCH_CPU_X86_FAMILY) || defined(ARCH_CPU_ARM_FAMILY) || \
     defined(ARCH_CPU_MIPS_FAMILY)
   // The stack grows downward, so hand clone() the one-past-the-end address.
   void* stack = stack_buf + sizeof(stack_buf);
 #else
 #error "Unsupported architecture"
 #endif

   // CLONE_FS: the child shares file system information with this process
   // (step 1 above), which is what lets its chroot() take effect here.
   int clone_flags = CLONE_FS | LINUX_SIGCHLD;
   void* tls = nullptr;
 #if (defined(ARCH_CPU_X86_64) || defined(ARCH_CPU_ARM_FAMILY)) && \
     !defined(MEMORY_SANITIZER)
   // Use CLONE_VM | CLONE_VFORK as an optimization to avoid copying page tables.
   // Since clone writes to the new child's TLS before returning, we must set a
   // new TLS to avoid corrupting the current process's TLS. On ARCH_CPU_X86,
   // glibc performs syscalls by calling a function pointer in TLS, so we do not
   // attempt this optimization.
   // TODO(crbug.com/1247458) Broken in MSan builds after LLVM f1bb30a4956f.
   clone_flags |= CLONE_VM | CLONE_VFORK | CLONE_SETTLS;

   // PTHREAD_STACK_MIN may not be a compile-time constant with glibc 2.34+, so
   // tls_buf cannot be zeroed with a `{ 0 }` initializer; clear it explicitly.
   char tls_buf[PTHREAD_STACK_MIN];
   memset(tls_buf, 0, PTHREAD_STACK_MIN);
   tls = tls_buf;
 #endif

   // Run ChrootToSelfFdinfo() in the child, on |stack| and with |tls|.
   pid = clone(ChrootToSelfFdinfo, stack, clone_flags, nullptr, nullptr, tls,
               nullptr);
   PCHECK(pid != -1);

   // Reap the child (step 3 above) and collect its exit status.
   int status = -1;
   PCHECK(HANDLE_EINTR(waitpid(pid, &status, 0)) == pid);

   return WIFEXITED(status) && WEXITSTATUS(status) == kExitSuccess;
 }

 // CHECK() that an attempt to move to a new user namespace raised an expected
 // errno. |error| is the errno value the caller captured after the failed
 // clone()/unshare() attempt.
 void CheckCloneNewUserErrno(int error) {
   // PCHECK describes the *current* errno when it fires. Callers may have run
   // other code (e.g. logging) since capturing |error|, so restore it to make
   // the diagnostic describe the failure actually being checked.
   errno = error;

   // EPERM can happen if already in a chroot. EUSERS if too many nested
   // namespaces are used. EINVAL for kernels that don't support the feature.
   // ENOSPC can occur when the system has reached its maximum configured
   // number of user namespaces.
   PCHECK(error == EPERM || error == EUSERS || error == EINVAL ||
          error == ENOSPC);
 }

 // Maps a Credentials::Capability onto the kernel's CAP_* constant for it.
 // Falling out of the switch means |cap| does not match any enumerator handled
 // below; that is reported with LOG(FATAL).
 int CapabilityToKernelValue(Credentials::Capability cap) {
   using Capability = Credentials::Capability;

   switch (cap) {
     case Capability::SYS_ADMIN:
       return CAP_SYS_ADMIN;
     case Capability::SYS_CHROOT:
       return CAP_SYS_CHROOT;
   }

   LOG(FATAL) << "Invalid Capability: " << static_cast<int>(cap);
   return 0;
 }

 }  // namespace.

 // static
 // Queries the real, effective and saved uids and gids of the caller. Returns
 // false when the three uids, or the three gids, are not all identical.
 // Otherwise writes the common uid to |resuid| and the common gid to |resgid|
 // (a null pointer skips that output) and returns true. A failing query is
 // fatal (PCHECK).
 bool Credentials::GetRESIds(uid_t* resuid, gid_t* resgid) {
   uid_t ruid;
   uid_t euid;
   uid_t suid;
   PCHECK(sys_getresuid(&ruid, &euid, &suid) == 0);

   gid_t rgid;
   gid_t egid;
   gid_t sgid;
   PCHECK(sys_getresgid(&rgid, &egid, &sgid) == 0);

   // Any id that disagrees with the real id makes the whole query fail.
   if (ruid != euid || ruid != suid)
     return false;
   if (rgid != egid || rgid != sgid)
     return false;

   if (resuid != nullptr)
     *resuid = euid;
   if (resgid != nullptr)
     *resgid = egid;
   return true;
 }

 // static
 // Writes |gid| to /proc/self/gid_map and |uid| to /proc/self/uid_map through
 // NamespaceUtils::WriteToIdMapFile(). When the kernel supports denying
 // setgroups, that is done first. Returns false if denying setgroups or
 // either write fails, true otherwise.
 bool Credentials::SetGidAndUidMaps(gid_t gid, uid_t uid) {
   const char kGidMapFile[] = "/proc/self/gid_map";
   const char kUidMapFile[] = "/proc/self/uid_map";
   if (NamespaceUtils::KernelSupportsDenySetgroups() &&
       !NamespaceUtils::DenySetgroups()) {
     return false;
   }
   // Debug-only sanity check: real, effective and saved ids must agree both
   // before and after the maps are written.
   DCHECK(GetRESIds(nullptr, nullptr));
   // The gid map is written before the uid map.
   if (!NamespaceUtils::WriteToIdMapFile(kGidMapFile, gid) ||
       !NamespaceUtils::WriteToIdMapFile(kUidMapFile, uid)) {
     return false;
   }
   DCHECK(GetRESIds(nullptr, nullptr));
   return true;
 }

 // static
 bool Credentials::DropAllCapabilities(int proc_fd) {
   if (!SetCapabilities(proc_fd, std::vector<Capability>())) {
     return false;
   }

   CHECK(!HasAnyCapability());
   return true;
 }

 // static
 bool Credentials::DropAllCapabilities() {
   base::ScopedFD proc_fd(ProcUtil::OpenProc());
   return Credentials::DropAllCapabilities(proc_fd.get());
 }

 // static
 bool Credentials::DropAllCapabilitiesOnCurrentThread() {
   return SetCapabilitiesOnCurrentThread(std::vector<Capability>());
 }

 // static
 bool Credentials::SetCapabilitiesOnCurrentThread(
     const std::vector<Capability>& caps) {
   struct cap_hdr hdr = {};
   hdr.version = _LINUX_CAPABILITY_VERSION_3;
   struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}};

   // Initially, cap has no capability flags set. Enable the effective and
   // permitted flags only for the requested capabilities.
   for (const Capability cap : caps) {
     const int cap_num = CapabilityToKernelValue(cap);
     const size_t index = CAP_TO_INDEX(cap_num);
     const uint32_t mask = CAP_TO_MASK(cap_num);
     data[index].effective |= mask;
     data[index].permitted |= mask;
   }

   return sys_capset(&hdr, data) == 0;
 }

 // static
 // Sets the capabilities of the caller to exactly |caps| by delegating to
 // SetCapabilitiesOnCurrentThread(), after CHECKing (except under TSAN) that
 // the process is single-threaded. |proc_fd| is passed to
 // ThreadHelpers::IsSingleThreaded() and must be non-negative (DCHECK).
 // Returns the result of SetCapabilitiesOnCurrentThread().
 bool Credentials::SetCapabilities(int proc_fd,
                                   const std::vector<Capability>& caps) {
   DCHECK_LE(0, proc_fd);

 #if !defined(THREAD_SANITIZER)
   // With TSAN, accept to break the security model as it is a testing
   // configuration.
   CHECK(ThreadHelpers::IsSingleThreaded(proc_fd));
 #endif

   return SetCapabilitiesOnCurrentThread(caps);
 }

 // Fetches the capability sets with sys_capget() (failure is fatal) and
 // returns true if any effective, permitted or inheritable bit is set.
 bool Credentials::HasAnyCapability() {
   struct cap_hdr hdr = {};
   hdr.version = _LINUX_CAPABILITY_VERSION_3;
   struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}};

   PCHECK(sys_capget(&hdr, data) == 0);

   // One set bit anywhere is enough to answer "yes".
   for (const auto& word : data) {
     const bool any_bit_set =
         word.effective || word.permitted || word.inheritable;
     if (any_bit_set)
       return true;
   }
   return false;
 }

 // Fetches the capability sets with sys_capget() (failure is fatal) and
 // returns true if |cap| is present in the effective, permitted or
 // inheritable set.
 bool Credentials::HasCapability(Capability cap) {
   struct cap_hdr hdr = {};
   hdr.version = _LINUX_CAPABILITY_VERSION_3;
   struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}};

   PCHECK(sys_capget(&hdr, data) == 0);

   // Locate the word and bit that hold |cap|, then test that bit in the union
   // of the three sets.
   const int kernel_cap = CapabilityToKernelValue(cap);
   const uint32_t bit = CAP_TO_MASK(kernel_cap);
   const auto& word = data[CAP_TO_INDEX(kernel_cap)];
   const uint32_t any_set = word.effective | word.permitted | word.inheritable;
   return (any_set & bit) != 0;
 }

 // static
 // Probes whether a child process can be created in a new user namespace and
 // whether, after setting up its id maps and dropping all capabilities, that
 // child can still unshare() another user namespace.
 // Returns false under TSAN, when GetRESIds() fails, when creating the child
 // fails with one of the errno values accepted by CheckCloneNewUserErrno()
 // (any other errno is fatal), or when the child does not exit with
 // kExitSuccess. Returns true otherwise.
 bool Credentials::CanCreateProcessInNewUserNS() {
 #if defined(THREAD_SANITIZER)
   // With TSAN, processes will always have threads running and can never
   // enter a new user namespace with MoveToNewUserNS().
   return false;
 #else
   uid_t uid;
   gid_t gid;
   if (!GetRESIds(&uid, &gid)) {
     return false;
   }

   const pid_t pid =
       base::ForkWithFlags(CLONE_NEWUSER | SIGCHLD, nullptr, nullptr);

   if (pid == -1) {
     CheckCloneNewUserErrno(errno);
     return false;
   }

   // The parent process could have had threads. In the child, these threads
   // have disappeared.
   if (pid == 0) {
     // unshare() requires the effective uid and gid to have a mapping in the
     // parent namespace.
     if (!SetGidAndUidMaps(gid, uid))
       _exit(kExitFailure);

     // Make sure we drop CAP_SYS_ADMIN.
     CHECK(sandbox::Credentials::DropAllCapabilities());

     // Ensure we have unprivileged use of CLONE_NEWUSER.  Debian
     // Jessie explicitly forbids this case.  See:
     // add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by-default.patch
     if (sys_unshare(CLONE_NEWUSER))
       _exit(kExitFailure);

     // The child only reports its verdict through its exit status.
     _exit(kExitSuccess);
   }

   // Always reap the child.
   int status = -1;
   PCHECK(HANDLE_EINTR(waitpid(pid, &status, 0)) == pid);

   // The child is only expected to terminate through one of the _exit() calls
   // above.
   DCHECK(WIFEXITED(status) && (WEXITSTATUS(status) == kExitSuccess ||
                                WEXITSTATUS(status) == kExitFailure));

   // clone(2) succeeded.  Now return true only if the system grants
   // unprivileged use of CLONE_NEWUSER as well.
   return WIFEXITED(status) && WEXITSTATUS(status) == kExitSuccess;
 #endif
 }

 // Moves the caller into a new user namespace with unshare(CLONE_NEWUSER) and
 // then installs uid/gid maps for the ids it had beforehand.
 // Returns false if the real/effective/saved ids differ or if unshare() fails
 // with an errno accepted by CheckCloneNewUserErrno() (other values are
 // fatal). Failing to write the id maps is fatal as well.
 bool Credentials::MoveToNewUserNS() {
   uid_t uid;
   gid_t gid;
   const bool ids_match = GetRESIds(&uid, &gid);
   if (!ids_match) {
     // If all the uids (or gids) are not equal to each other, the security
     // model will most likely confuse the caller, abort.
     DVLOG(1) << "uids or gids differ!";
     return false;
   }

   if (sys_unshare(CLONE_NEWUSER) != 0) {
     // Grab errno right away, before logging gets a chance to overwrite it.
     const int unshare_errno = errno;
     VLOG(1) << "Looks like unprivileged CLONE_NEWUSER may not be available "
             << "on this kernel.";
     CheckCloneNewUserErrno(unshare_errno);
     return false;
   }

   // The current {r,e,s}{u,g}id is now an overflow id (c.f.
   // /proc/sys/kernel/overflowuid). Setup the uid and gid maps.
   PCHECK(SetGidAndUidMaps(gid, uid));
   return true;
 }

 // Removes file system access from the caller by chroot()ing into a safe empty
 // directory (see ChrootToSafeEmptyDir()), then verifies the result.
 // |proc_fd| must be non-negative and is handed to
 // ProcUtil::HasOpenDirectory(). Every step is CHECKed, so the function either
 // returns true or terminates the process.
 bool Credentials::DropFileSystemAccess(int proc_fd) {
   CHECK_LE(0, proc_fd);

   CHECK(ChrootToSafeEmptyDir());
   // Post-conditions: /proc no longer resolves to a directory, and
   // ProcUtil::HasOpenDirectory() reports none for |proc_fd|.
   CHECK(!HasFileSystemAccess());
   CHECK(!ProcUtil::HasOpenDirectory(proc_fd));
   // We never let this function fail.
   return true;
 }

 bool Credentials::HasFileSystemAccess() {
   return base::DirectoryExists(base::FilePath("/proc"));
 }

 // Calls fork() and, in the child only, drops all capabilities on the current
 // thread (failure there is fatal). Returns fork()'s result unchanged: 0 in
 // the child, a non-zero value in the parent.
 pid_t Credentials::ForkAndDropCapabilitiesInChild() {
   const pid_t pid = fork();
   if (pid == 0) {
     // Since we just forked, we are single threaded.
     PCHECK(DropAllCapabilitiesOnCurrentThread());
   }
   return pid;
 }

 }  // namespace sandbox.
	// Copyright (c) 2013 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "sandbox/linux/services/credentials.h"

	#include <errno.h>
	#include <limits.h>
	#include <sched.h>
	#include <signal.h>
	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <sys/types.h>
	#include <sys/wait.h>
	#include <unistd.h>

	#include "base/bind.h"
	#include "base/compiler_specific.h"
	#include "base/cxx17_backports.h"
	#include "base/files/file_path.h"
	#include "base/files/file_util.h"
	#include "base/logging.h"
	#include "base/posix/eintr_wrapper.h"
	#include "base/process/launch.h"
	#include "build/build_config.h"
	#include "sandbox/linux/services/namespace_utils.h"
	#include "sandbox/linux/services/proc_util.h"
	#include "sandbox/linux/services/syscall_wrappers.h"
	#include "sandbox/linux/services/thread_helpers.h"
	#include "sandbox/linux/system_headers/capability.h"
	#include "sandbox/linux/system_headers/linux_signal.h"

	namespace sandbox {

	namespace {

	// Exit statuses used by the short-lived child processes created in this file.
	constexpr int kExitSuccess = 0;
	#if !defined(THREAD_SANITIZER)
	// Only referenced from code that is compiled out under ThreadSanitizer.
	constexpr int kExitFailure = 1;
	#endif

	// Child-side entry point handed to clone() by ChrootToSafeEmptyDir(). It
	// chroots to /proc/self/fdinfo/, changes directory to the new root and then
	// terminates the child with kExitSuccess via _exit(), so it does not return
	// normally. The void* argument is unused.
	#if defined(__clang__)
	// Disable sanitizers that rely on TLS and may write to non-stack memory.
	__attribute__((no_sanitize_address))
	__attribute__((no_sanitize_thread))
	__attribute__((no_sanitize_memory))
	#endif
	int ChrootToSelfFdinfo(void*) {
	// This function can be run from a vforked child, so it should not write to
	// any memory other than the stack or errno. Reads from TLS may be different
	// from in the parent process.
	RAW_CHECK(sys_chroot("/proc/self/fdinfo/") == 0);

	// CWD is essentially an implicit file descriptor, so be careful to not
	// leave it behind.
	RAW_CHECK(chdir("/") == 0);
	// Leave through _exit() instead of returning to the clone() caller.
	_exit(kExitSuccess);
	}

	// chroot() to an empty dir that is "safe". To be safe, it must not contain
	// any subdirectory (chroot-ing there would allow a chroot escape) and it must
	// be impossible to create an empty directory there.
	// We achieve this by doing the following:
	// 1. We create a new process sharing file system information.
	// 2. In the child, we chroot to /proc/self/fdinfo/
	// This is already "safe", since fdinfo/ does not contain another directory and
	// one cannot create another directory there.
	// 3. The process dies
	// After (3) happens, the directory is not available anymore in /proc.
	//
	// Returns true iff the child exited normally with status kExitSuccess. A
	// failing clone() or waitpid() is fatal (PCHECK).
	bool ChrootToSafeEmptyDir() {
	  // We need to chroot to a fdinfo that is unique to a process and have that
	  // process die.
	  // 1. We don't want to simply fork() because duplicating the page tables is
	  // slow with a big address space.
	  // 2. We do not use a regular thread (that would unshare CLONE_FILES) because
	  // when we are in a PID namespace, we cannot easily get a handle to the
	  // /proc/tid directory for the thread (since /proc may not be aware of the
	  // PID namespace). With a process, we can just use /proc/self.
	  pid_t pid = -1;
	  // Stack for the child; it lives in this frame, which stays alive until the
	  // child has been reaped below.
	  alignas(16) char stack_buf[PTHREAD_STACK_MIN];
	#if defined(ARCH_CPU_X86_FAMILY) || defined(ARCH_CPU_ARM_FAMILY) || \
	    defined(ARCH_CPU_MIPS_FAMILY)
	  // The stack grows downward, so hand clone() the one-past-the-end address.
	  void* stack = stack_buf + sizeof(stack_buf);
	#else
	#error "Unsupported architecture"
	#endif

	  // CLONE_FS: the child shares file system information with this process
	  // (step 1 above), which is what lets its chroot() take effect here.
	  int clone_flags = CLONE_FS | LINUX_SIGCHLD;
	  void* tls = nullptr;
	#if (defined(ARCH_CPU_X86_64) || defined(ARCH_CPU_ARM_FAMILY)) && \
	    !defined(MEMORY_SANITIZER)
	  // Use CLONE_VM | CLONE_VFORK as an optimization to avoid copying page tables.
	  // Since clone writes to the new child's TLS before returning, we must set a
	  // new TLS to avoid corrupting the current process's TLS. On ARCH_CPU_X86,
	  // glibc performs syscalls by calling a function pointer in TLS, so we do not
	  // attempt this optimization.
	  // TODO(crbug.com/1247458) Broken in MSan builds after LLVM f1bb30a4956f.
	  clone_flags |= CLONE_VM | CLONE_VFORK | CLONE_SETTLS;

	  // PTHREAD_STACK_MIN may not be a compile-time constant with glibc 2.34+, so
	  // tls_buf cannot be zeroed with a `{ 0 }` initializer; clear it explicitly.
	  char tls_buf[PTHREAD_STACK_MIN];
	  memset(tls_buf, 0, PTHREAD_STACK_MIN);
	  tls = tls_buf;
	#endif

	  // Run ChrootToSelfFdinfo() in the child, on |stack| and with |tls|.
	  pid = clone(ChrootToSelfFdinfo, stack, clone_flags, nullptr, nullptr, tls,
	              nullptr);
	  PCHECK(pid != -1);

	  // Reap the child (step 3 above) and collect its exit status.
	  int status = -1;
	  PCHECK(HANDLE_EINTR(waitpid(pid, &status, 0)) == pid);

	  return WIFEXITED(status) && WEXITSTATUS(status) == kExitSuccess;
	}

	// CHECK() that an attempt to move to a new user namespace raised an expected
	// errno.
	void CheckCloneNewUserErrno(int error) {
	// EPERM can happen if already in a chroot. EUSERS if too many nested
	// namespaces are used. EINVAL for kernels that don't support the feature.
	// ENOSPC can occur when the system has reached its maximum configured
	// number of user namespaces.
	PCHECK(error == EPERM \|\| error == EUSERS \|\| error == EINVAL \|\|
	error == ENOSPC);
	}

	// Maps a Credentials::Capability onto the kernel's CAP_* constant for it.
	// Falling out of the switch means |cap| does not match any enumerator handled
	// below; that is reported with LOG(FATAL).
	int CapabilityToKernelValue(Credentials::Capability cap) {
	  using Capability = Credentials::Capability;

	  switch (cap) {
	    case Capability::SYS_ADMIN:
	      return CAP_SYS_ADMIN;
	    case Capability::SYS_CHROOT:
	      return CAP_SYS_CHROOT;
	  }

	  LOG(FATAL) << "Invalid Capability: " << static_cast<int>(cap);
	  return 0;
	}

	} // namespace.

	// static
	// Queries the real, effective and saved uids and gids of the caller. Returns
	// false when the three uids, or the three gids, are not all identical.
	// Otherwise writes the common uid to |resuid| and the common gid to |resgid|
	// (a null pointer skips that output) and returns true. A failing query is
	// fatal (PCHECK).
	bool Credentials::GetRESIds(uid_t* resuid, gid_t* resgid) {
	  uid_t ruid;
	  uid_t euid;
	  uid_t suid;
	  PCHECK(sys_getresuid(&ruid, &euid, &suid) == 0);

	  gid_t rgid;
	  gid_t egid;
	  gid_t sgid;
	  PCHECK(sys_getresgid(&rgid, &egid, &sgid) == 0);

	  // Any id that disagrees with the real id makes the whole query fail.
	  if (ruid != euid || ruid != suid)
	    return false;
	  if (rgid != egid || rgid != sgid)
	    return false;

	  if (resuid != nullptr)
	    *resuid = euid;
	  if (resgid != nullptr)
	    *resgid = egid;
	  return true;
	}

	// static
	// Writes |gid| to /proc/self/gid_map and |uid| to /proc/self/uid_map through
	// NamespaceUtils::WriteToIdMapFile(). When the kernel supports denying
	// setgroups, that is done first. Returns false if denying setgroups or
	// either write fails, true otherwise.
	bool Credentials::SetGidAndUidMaps(gid_t gid, uid_t uid) {
	  const char kGidMapFile[] = "/proc/self/gid_map";
	  const char kUidMapFile[] = "/proc/self/uid_map";
	  if (NamespaceUtils::KernelSupportsDenySetgroups() &&
	      !NamespaceUtils::DenySetgroups()) {
	    return false;
	  }
	  // Debug-only sanity check: real, effective and saved ids must agree both
	  // before and after the maps are written.
	  DCHECK(GetRESIds(nullptr, nullptr));
	  // The gid map is written before the uid map.
	  if (!NamespaceUtils::WriteToIdMapFile(kGidMapFile, gid) ||
	      !NamespaceUtils::WriteToIdMapFile(kUidMapFile, uid)) {
	    return false;
	  }
	  DCHECK(GetRESIds(nullptr, nullptr));
	  return true;
	}

	// static
	bool Credentials::DropAllCapabilities(int proc_fd) {
	if (!SetCapabilities(proc_fd, std::vector<Capability>())) {
	return false;
	}

	CHECK(!HasAnyCapability());
	return true;
	}

	// static
	bool Credentials::DropAllCapabilities() {
	base::ScopedFD proc_fd(ProcUtil::OpenProc());
	return Credentials::DropAllCapabilities(proc_fd.get());
	}

	// static
	bool Credentials::DropAllCapabilitiesOnCurrentThread() {
	return SetCapabilitiesOnCurrentThread(std::vector<Capability>());
	}

	// static
	bool Credentials::SetCapabilitiesOnCurrentThread(
	const std::vector<Capability>& caps) {
	struct cap_hdr hdr = {};
	hdr.version = _LINUX_CAPABILITY_VERSION_3;
	struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}};

	// Initially, cap has no capability flags set. Enable the effective and
	// permitted flags only for the requested capabilities.
	for (const Capability cap : caps) {
	const int cap_num = CapabilityToKernelValue(cap);
	const size_t index = CAP_TO_INDEX(cap_num);
	const uint32_t mask = CAP_TO_MASK(cap_num);
	data[index].effective \|= mask;
	data[index].permitted \|= mask;
	}

	return sys_capset(&hdr, data) == 0;
	}

	// static
	// Sets the capabilities of the caller to exactly |caps| by delegating to
	// SetCapabilitiesOnCurrentThread(), after CHECKing (except under TSAN) that
	// the process is single-threaded. |proc_fd| is passed to
	// ThreadHelpers::IsSingleThreaded() and must be non-negative (DCHECK).
	// Returns the result of SetCapabilitiesOnCurrentThread().
	bool Credentials::SetCapabilities(int proc_fd,
	const std::vector<Capability>& caps) {
	DCHECK_LE(0, proc_fd);

	#if !defined(THREAD_SANITIZER)
	// With TSAN, accept to break the security model as it is a testing
	// configuration.
	CHECK(ThreadHelpers::IsSingleThreaded(proc_fd));
	#endif

	return SetCapabilitiesOnCurrentThread(caps);
	}

	// Fetches the capability sets with sys_capget() (failure is fatal) and
	// returns true if any effective, permitted or inheritable bit is set.
	bool Credentials::HasAnyCapability() {
	  struct cap_hdr hdr = {};
	  hdr.version = _LINUX_CAPABILITY_VERSION_3;
	  struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}};

	  PCHECK(sys_capget(&hdr, data) == 0);

	  // One set bit anywhere is enough to answer "yes".
	  for (const auto& word : data) {
	    const bool any_bit_set =
	        word.effective || word.permitted || word.inheritable;
	    if (any_bit_set)
	      return true;
	  }
	  return false;
	}

	// Fetches the capability sets with sys_capget() (failure is fatal) and
	// returns true if |cap| is present in the effective, permitted or
	// inheritable set.
	bool Credentials::HasCapability(Capability cap) {
	  struct cap_hdr hdr = {};
	  hdr.version = _LINUX_CAPABILITY_VERSION_3;
	  struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}};

	  PCHECK(sys_capget(&hdr, data) == 0);

	  // Locate the word and bit that hold |cap|, then test that bit in the union
	  // of the three sets.
	  const int kernel_cap = CapabilityToKernelValue(cap);
	  const uint32_t bit = CAP_TO_MASK(kernel_cap);
	  const auto& word = data[CAP_TO_INDEX(kernel_cap)];
	  const uint32_t any_set = word.effective | word.permitted | word.inheritable;
	  return (any_set & bit) != 0;
	}

	// static
	// Probes whether a child process can be created in a new user namespace and
	// whether, after setting up its id maps and dropping all capabilities, that
	// child can still unshare() another user namespace.
	// Returns false under TSAN, when GetRESIds() fails, when creating the child
	// fails with one of the errno values accepted by CheckCloneNewUserErrno()
	// (any other errno is fatal), or when the child does not exit with
	// kExitSuccess. Returns true otherwise.
	bool Credentials::CanCreateProcessInNewUserNS() {
	#if defined(THREAD_SANITIZER)
	  // With TSAN, processes will always have threads running and can never
	  // enter a new user namespace with MoveToNewUserNS().
	  return false;
	#else
	  uid_t uid;
	  gid_t gid;
	  if (!GetRESIds(&uid, &gid)) {
	    return false;
	  }

	  const pid_t pid =
	      base::ForkWithFlags(CLONE_NEWUSER | SIGCHLD, nullptr, nullptr);

	  if (pid == -1) {
	    CheckCloneNewUserErrno(errno);
	    return false;
	  }

	  // The parent process could have had threads. In the child, these threads
	  // have disappeared.
	  if (pid == 0) {
	    // unshare() requires the effective uid and gid to have a mapping in the
	    // parent namespace.
	    if (!SetGidAndUidMaps(gid, uid))
	      _exit(kExitFailure);

	    // Make sure we drop CAP_SYS_ADMIN.
	    CHECK(sandbox::Credentials::DropAllCapabilities());

	    // Ensure we have unprivileged use of CLONE_NEWUSER. Debian
	    // Jessie explicitly forbids this case. See:
	    // add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by-default.patch
	    if (sys_unshare(CLONE_NEWUSER))
	      _exit(kExitFailure);

	    // The child only reports its verdict through its exit status.
	    _exit(kExitSuccess);
	  }

	  // Always reap the child.
	  int status = -1;
	  PCHECK(HANDLE_EINTR(waitpid(pid, &status, 0)) == pid);

	  // The child is only expected to terminate through one of the _exit() calls
	  // above.
	  DCHECK(WIFEXITED(status) && (WEXITSTATUS(status) == kExitSuccess ||
	                               WEXITSTATUS(status) == kExitFailure));

	  // clone(2) succeeded. Now return true only if the system grants
	  // unprivileged use of CLONE_NEWUSER as well.
	  return WIFEXITED(status) && WEXITSTATUS(status) == kExitSuccess;
	#endif
	}

	// Moves the caller into a new user namespace with unshare(CLONE_NEWUSER) and
	// then installs uid/gid maps for the ids it had beforehand.
	// Returns false if the real/effective/saved ids differ or if unshare() fails
	// with an errno accepted by CheckCloneNewUserErrno() (other values are
	// fatal). Failing to write the id maps is fatal as well.
	bool Credentials::MoveToNewUserNS() {
	  uid_t uid;
	  gid_t gid;
	  const bool ids_match = GetRESIds(&uid, &gid);
	  if (!ids_match) {
	    // If all the uids (or gids) are not equal to each other, the security
	    // model will most likely confuse the caller, abort.
	    DVLOG(1) << "uids or gids differ!";
	    return false;
	  }

	  if (sys_unshare(CLONE_NEWUSER) != 0) {
	    // Grab errno right away, before logging gets a chance to overwrite it.
	    const int unshare_errno = errno;
	    VLOG(1) << "Looks like unprivileged CLONE_NEWUSER may not be available "
	            << "on this kernel.";
	    CheckCloneNewUserErrno(unshare_errno);
	    return false;
	  }

	  // The current {r,e,s}{u,g}id is now an overflow id (c.f.
	  // /proc/sys/kernel/overflowuid). Setup the uid and gid maps.
	  PCHECK(SetGidAndUidMaps(gid, uid));
	  return true;
	}

	// Removes file system access from the caller by chroot()ing into a safe empty
	// directory (see ChrootToSafeEmptyDir()), then verifies the result.
	// |proc_fd| must be non-negative and is handed to
	// ProcUtil::HasOpenDirectory(). Every step is CHECKed, so the function either
	// returns true or terminates the process.
	bool Credentials::DropFileSystemAccess(int proc_fd) {
	CHECK_LE(0, proc_fd);

	CHECK(ChrootToSafeEmptyDir());
	// Post-conditions: /proc no longer resolves to a directory, and
	// ProcUtil::HasOpenDirectory() reports none for |proc_fd|.
	CHECK(!HasFileSystemAccess());
	CHECK(!ProcUtil::HasOpenDirectory(proc_fd));
	// We never let this function fail.
	return true;
	}

	bool Credentials::HasFileSystemAccess() {
	return base::DirectoryExists(base::FilePath("/proc"));
	}

	// Calls fork() and, in the child only, drops all capabilities on the current
	// thread (failure there is fatal). Returns fork()'s result unchanged: 0 in
	// the child, a non-zero value in the parent.
	pid_t Credentials::ForkAndDropCapabilitiesInChild() {
	  const pid_t pid = fork();
	  if (pid == 0) {
	    // Since we just forked, we are single threaded.
	    PCHECK(DropAllCapabilitiesOnCurrentThread());
	  }
	  return pid;
	}

	} // namespace sandbox.