blob: 809f2501d86da5ae83d069a05ef3a2e8460cab7c [file] [log] [blame]
# Copyright 2017 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
description "Start the VM concierge service"
author "chromium-os-dev@chromium.org"
# Start the VM concierge service, which is responsible for managing all the
# VMs running in the system.
start on start-user-session
stop on stopping ui
respawn
# Upstart expects the main process to call fork(2) twice.
# This means that the script stanza is not allowed to execute any external
# command other than the main process.
#
# Note that ChromeOS's upstart does not care about processes that have forked
# and exec'ed, because of https://crrev.com/c/3097231. This behavior was
# implemented to support nesting minijail0. See https://crbug.com/1240118 for
# the detailed background.
expect daemon
# VM concierge is a restartable daemon.
oom score -100
tmpfiles /usr/lib/tmpfiles.d/arcvm.conf /usr/lib/tmpfiles.d/vm_tools.conf
# Camera libraries are mounted here at runtime.
env CAMERA_LIBFS_DIR=/usr/share/cros-camera/libfs
# Force gRPC to use the native resolver instead of ares.
# TODO(crbug.com/1044665): Remove once gRPC doesn't use ares resolver for vsock.
env GRPC_DNS_RESOLVER=native
# TODO: Remove once the Perfetto SDK is updated beyond v9.0 in virglrenderer and
# other client code -- the new SDK has this path built in.
env PERFETTO_PRODUCER_SOCK_NAME=/run/perfetto/traced-producer.sock
# Give any running VMs enough time to attempt an orderly shutdown.
kill timeout 30
# The virtio-fs device opens a lot of fds and so the whole process tree needs a
# much higher limit.
limit nofile 1024 262144
# Give rtprio to VMs so that they can set threads to real-time priority.
# 10 is for the audio client threads in the AC'97 device.
limit rtprio 10 10
pre-start script
  # Prepares the mount layout used by the main job: makes /run/arcvm a shared
  # mountpoint, creates the persistent mount namespace at
  # /run/namespaces/mnt_concierge, populates /run/arcvm inside that namespace,
  # and records dev mode in /run/vm/dev_mode.

  # If the pre-start script fails, by default nothing gets written to
  # the logs. Make sure we can find out what went wrong by
  # re-directing stderr to a temp file and sending it to the syslog if
  # we encounter an error.
  tmp_log="$(mktemp)"
  exec 2>"${tmp_log}"
  dump_log() {
    # $? is the status the script is exiting with, so it has to be checked
    # before any other command runs in this handler.
    if [ $? != 0 ]; then
      set +x
      # Split up lines before calling logger to avoid running into line limits.
      xargs -n 1 -d"\n" logger --tag "${UPSTART_JOB}" < "${tmp_log}"
    fi
    # The captured trace is no longer needed once it has been forwarded (or
    # the script succeeded); remove it so a temp file is not left behind on
    # every job start. A cleanup failure is deliberately ignored so that it
    # cannot turn a successful pre-start into a failed one.
    rm -f -- "${tmp_log}" || true
  }
  trap dump_log EXIT
  set -x

  # Make /run/arcvm a shared mountpoint.
  mount --no-canonicalize --bind /run/arcvm /run/arcvm
  mount --no-canonicalize --make-shared /run/arcvm

  # Create persistent mount namespace at /run/namespaces/mnt_concierge.
  unshare --mount=/run/namespaces/mnt_concierge --propagation unchanged \
    -- /bin/true
  # Remount the bind-mount as 'noexec'.
  mount --no-canonicalize /run/namespaces/mnt_concierge -o remount,noexec

  # Prevent mounts at /run/arcvm from propagating out of the namespace.
  # (The continuation inside the quoted string is kept at column 0 so that no
  # extra whitespace ends up in the command passed to bash.)
  nsenter --mount=/run/namespaces/mnt_concierge --no-fork \
    -- bash -c "mount --no-canonicalize --make-slave /run/arcvm && \
mount --no-canonicalize --make-shared /run/arcvm"

  nsenter --mount=/run/namespaces/mnt_concierge --no-fork \
    -- mount --no-canonicalize --bind \
    /mnt/stateful_partition/unencrypted/apkcache /run/arcvm/apkcache

  # The runtime directory for MyFiles, removable media, and virtual-files in
  # ARCVM. UID and GID correspond to Android's root and system, respectively.
  nsenter --mount=/run/namespaces/mnt_concierge --no-fork \
    -- mount --no-canonicalize -o \
    rw,nodev,noexec,nosuid,mode=0555,uid=655360,gid=656360 -t tmpfs \
    tmpfs /run/arcvm/media

  # Leave a marker for the main script stanza, which only checks for this file
  # when deciding whether to add the dev-mode bind mounts.
  if crossystem "cros_debug?1"; then
    touch /run/vm/dev_mode # croslint: disable
  fi
end script
# We MUST NOT fork child processes other than the actual daemon (= minijail0)
# because `expect daemon` is set.
# So calling external commands (e.g. chown) is disallowed in this
# script stanza. Otherwise, upstart won't track the vm_concierge PID, so
# vm_concierge won't be killed when a user signs out.
script
# Start from an empty argument list; the optional minijail0 mount arguments
# are accumulated in "$@" using shell builtins only.
set --

# Optional device nodes, bound writable when present.
# (crbug.com/892806): remove check when udmabuf is built for all target kernels.
for dev_node in /dev/mali0 /dev/pvr_sync /dev/udmabuf; do
  if [ -c "${dev_node}" ]; then
    set -- "$@" -b "${dev_node},,1"
  fi
done

# Optional /run subpaths, bound writable when present.
for run_dir in /run/camera /run/cups_proxy /run/perfetto /run/pvm; do
  if [ -d "${run_dir}" ]; then
    set -- "$@" -b "${run_dir},,1"
  fi
done

# Bind mounts under /run/arcvm/ro, the read-only shared directory.
# (go/arcvm-shared-dirs)
# Some 32-bit systems do not run ARCVM and have no lib64 directories, so their
# absence must not be treated as an error.
if [ -d "/lib64" ]; then
  set -- "$@" \
    -b /lib64,/run/arcvm/ro/lib64 \
    -b /usr/lib64,/run/arcvm/ro/usr/lib64
fi
set -- "$@" -b /usr/share/fonts,/run/arcvm/ro/fonts
# Each optional binary is exposed at the same path below /run/arcvm/ro.
for ro_file in /usr/bin/vshd /sbin/ureadahead; do
  if [ -f "${ro_file}" ]; then
    set -- "$@" -b "${ro_file},/run/arcvm/ro${ro_file}"
  fi
done

# For vmm-swap feature.
if [ -c "/dev/userfaultfd" ]; then
  set -- "$@" -b /dev/userfaultfd
fi
# Dev-mode only bind mounts, added when the /run/vm/dev_mode marker file
# exists.
if [ -f "/run/vm/dev_mode" ]; then
  if [ -d "/usr/local/lib64" ]; then
    set -- "$@" -b /usr/local/lib64,/run/arcvm/ro/usr/local/lib64
  fi
  # Debugging tools are exposed at the same path below /run/arcvm/ro.
  for dev_tool in strace stress-ng trace-cmd; do
    tool_path="/usr/local/bin/${dev_tool}"
    if [ -f "${tool_path}" ]; then
      set -- "$@" -b "${tool_path},/run/arcvm/ro${tool_path}"
    fi
  done
  if [ -d "/opt/google/vms/android" ]; then
    set -- "$@" -b /opt/google/vms/android,,1
  fi
  # The /usr/local tmpfs and everything bound beneath it are added only when
  # /usr/local/vms or /usr/local/bin exists; the tmpfs argument comes first.
  if [ -d "/usr/local/vms" ] || [ -d "/usr/local/bin" ]; then
    set -- "$@" -k 'local,/usr/local,tmpfs,MS_NOSUID|MS_NODEV|MS_NOEXEC'
    if [ -d "/usr/local/vms" ]; then
      set -- "$@" -b /usr/local/vms,,1
    fi
    for local_dir in /usr/local/bin /usr/local/lib /usr/local/lib64; do
      if [ -d "${local_dir}" ]; then
        set -- "$@" -b "${local_dir}"
      fi
    done
  fi
fi
# Allow write access to vfio by bind mounting /dev/vfio iff:
#   - NVIDIA dGPU device is detected and is bound to vfio module.
#
# Since this is running in the script stanza, we are not allowed to fork or
# spawn new processes, so only shell builtins are used. For the same reason the
# compound test below is grouped with { ...; } instead of ( ... ): a
# parenthesised group runs in a subshell, which is itself a fork.
if [ -c "/dev/vfio/vfio" ]; then
  for f in /sys/bus/pci/devices/*/ ; do
    if [ -f "${f}/device" ]; then
      read -r vendor < "${f}/vendor"
      read -r class < "${f}/class"
      # boot_vga is only read for VGA-class devices; the initial value of 1
      # keeps the VGA branch of the test below false for every other class.
      boot_vga=1
      if [ "${class}" = "0x030000" ]; then
        read -r boot_vga < "${f}/boot_vga"
      fi
      # Check if NVIDIA dGPU device is detected.
      # class "0x030200" = 3D Controller;
      # class "0x030000" = VGA device;
      if [ "${vendor}" = "0x10de" ] &&
         { [ "${class}" = "0x030200" ] ||
           { [ "${class}" = "0x030000" ] &&
             [ "${boot_vga}" = "0" ]; }; }; then
        set -- "$@" -b /dev/vfio,/dev/vfio,1
        # One bind mount is enough; stop scanning so that several matching
        # devices do not add the same argument repeatedly.
        break
      fi
    fi
  done
fi
# The shadercached daemon-store is optional: add a recursive bind mount of it
# onto the same path only when the directory is present.
shader_store=/run/daemon-store/shadercached
if [ -d "${shader_store}" ]; then
  set -- "$@" -k "${shader_store},${shader_store},none,MS_BIND|MS_REC"
fi
# Concierge needs to be able to configure user namespaces due to the
# way ARC VM shares files with the host. This requires CAP_SETUID
# and CAP_SETGID, but we can restrict the impact of this by running
# concierge in an outer user namespace which only maps the uids
# needed for the inner namespaces (except that concierge itself
# needs access to a number of groups). We also need to map a number
# of groups for concierge itself. The mapping does not renumber any
# uids or gids to make the inner mappings easier to understand,
# except that we are forced to map something to 0, and we use
# crosvm-root for this purpose.
#
# uids:
# - crosvm (299)
# - shadercached (333)
# - crosvm-root (20182)
# - arc-camera (603, but we have to map 600-649 for unclear reasons)
# - nobody (65534)
# - Android (2,000,000 starting from 655360)
#
# gids:
# - all of the above uids except crosvm-dbus, plus
# - video (27)
# - daemon-store (400)
# - tun (413)
# - virtaccess (418)
# - cras (600)
# - wayland (601)
# - android-reserved-disk (20119)
# - pluginvm (20128)
# - cups-proxy (20136)
# - traced-producer (20162)
#
# /proc is also remounted read-write because crosvm needs to be able to set the
# uid_map and gid_map for its child processes and that needs a writable /proc.
#
# -Kslave is applied to propagate imageloader mounts into concierge's mount
# namespace.
#
# Each map below is a comma-separated list of "<inside> <outside> <count>"
# entries. The continuation lines start at column 0 on purpose: they are
# inside double quotes, so any leading whitespace would become part of the
# map string.
uid_map="0 20182 1,299 299 1,333 333 1,600 600 50,65534 65534 1,\
655360 655360 2000000"
gid_map="0 20182 1,27 27 1,299 299 1,333 333 1,400 400 1,413 413 1,418 418 1,\
600 600 50,20119 20119 1,20128 20128 1,20136 20136 1,20162 20162 1,\
65534 65534 1,655360 655360 2000000"
# exec replaces this shell with nsenter (invoked with --no-fork), so the
# script stanza does not leave an extra shell process in front of the daemon.
# The command enters the mount namespace at /run/namespaces/mnt_concierge and
# runs an outer minijail0 that receives the fixed mounts listed here followed
# by the optional ones accumulated in "$@". That outer jail runs
# /sbin/minijail0 with the uid/gid maps above (-m/-M) and the listed
# capabilities, which finally runs /usr/bin/vm_concierge.
# NOTE(review): the 0x5000 flags on the /run/imageloader mount presumably are
# the numeric form of MS_BIND|MS_REC (0x1000|0x4000) used by the other
# recursive binds -- confirm.
exec nsenter --mount=/run/namespaces/mnt_concierge --no-fork \
-- minijail0 -nlvd -i -t --uts \
-Kslave \
-P /mnt/empty \
-b /,/ \
-k 'proc,/proc,proc,MS_NOSUID|MS_NODEV|MS_NOEXEC' \
-b /sys,/sys \
-b /sys/devices,/sys/devices,1 \
-b /sys/kernel/tracing,,1 \
-k 'tmpfs,/sys/fs/cgroup,tmpfs,MS_NODEV|MS_NOEXEC|MS_NOSUID,mode=755,size=10M' \
-b /sys/fs/cgroup/cpu,,1 \
-b /sys/fs/cgroup/cpuset,,1 \
-b /dev/log,/dev/log,1 \
-b /dev/kvm,/dev/kvm,1 \
-b /dev/mapper/vm,/dev/mapper/vm,1 \
-b /dev/net,/dev/net,1 \
-b /dev/vhost-vsock,/dev/vhost-vsock,1 \
-b /dev/dri,/dev/dri,1 \
-k 'run,/run,tmpfs,MS_NOSUID|MS_NODEV|MS_NOEXEC' \
-b "${CAMERA_LIBFS_DIR}" \
-b /run/chrome,/run/chrome,1 \
-b /run/chromeos-config/v1/ \
-b /run/cras/vms,/run/cras,1 \
-b /run/dbus,/run/dbus,1 \
-b /run/mojo,,1 \
-b /run/vm,/run/vm,1 \
-b /run/vm_cicerone/client,/run/vm_cicerone/client,1 \
-k /run/imageloader,/run/imageloader,none,0x5000 \
-k 'var,/var,tmpfs,MS_NOSUID|MS_NODEV|MS_NOEXEC' \
-b /var/lib/metrics,,1 \
-b /var/lib/timezone \
-b /etc/os-release \
-b /var/lib/vm_cicerone/metrics,/var/lib/vm_cicerone/metrics,1 \
-b /var/lib/vm_concierge/vmm_swap_policy,,1 \
-k '/run/daemon-store/crosvm,/run/daemon-store/crosvm,none,MS_BIND|MS_REC' \
-k '/run/daemon-store/pvm,/run/daemon-store/pvm,none,MS_BIND|MS_REC' \
-k '/run/arcvm,/run/arcvm,none,MS_BIND|MS_REC' \
-b '/var/lib/metrics/structured,,1' \
-b '/var/lib/metrics/structured/events,,1' \
-R RLIMIT_MEMLOCK,unlimited,unlimited \
"$@" \
-- /sbin/minijail0 -U -pv -I -i \
-Kslave \
-c 'cap_setuid,cap_setgid,cap_ipc_lock+eip' --ambient \
-k 'proc,/proc,proc,MS_NOSUID|MS_NODEV|MS_NOEXEC' \
-m"${uid_map}" \
-M"${gid_map}" \
/usr/bin/vm_concierge
end script
post-stop script
  # Undo the mounts set up for the job: first the persistent mount namespace
  # file, then the /run/arcvm bind mount. Each path is unmounted only if it is
  # currently a mountpoint.
  for mnt in /run/namespaces/mnt_concierge /run/arcvm; do
    if mountpoint -q "${mnt}"; then
      umount "${mnt}"
    fi
  done
end script