| # Copyright 2017 The ChromiumOS Authors |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
description "Start the VM concierge service"
author "chromium-os-dev@chromium.org"

# vm_concierge is the service responsible for managing all the VMs running in
# the system. It lives for the duration of a user session.
start on start-user-session
stop on stopping ui
respawn

# With `expect daemon`, upstart expects the main process to call fork(2)
# twice. Consequently the script stanza must not execute any external command
# other than the main process.
#
# Note that ChromeOS's upstart does not care about processes that have forked
# and exec'ed, because of https://crrev.com/c/3097231. That behavior was
# implemented to support nesting minijail0; see https://crbug.com/1240118 for
# the detailed background.
expect daemon

# Give any running VMs enough time to attempt an orderly shutdown.
kill timeout 30

# VM concierge is a restartable daemon.
oom score -100

# The virtio-fs device opens a lot of fds, so the whole process tree needs a
# much higher (hard) limit.
limit nofile 1024 262144

# Allow rtprio so that VMs can set threads to real-time priority.
# 10 is for the audio client threads in the AC'97 device.
limit rtprio 10 10

tmpfiles /usr/lib/tmpfiles.d/arcvm.conf /usr/lib/tmpfiles.d/vm_tools.conf

# Camera libraries are mounted here at runtime.
env CAMERA_LIBFS_DIR=/usr/share/cros-camera/libfs

# Force gRPC to use the native resolver instead of ares.
# TODO(crbug.com/1044665): Remove once gRPC doesn't use ares resolver for vsock.
env GRPC_DNS_RESOLVER=native

# TODO: Remove once virglrenderer and the other client code use a Perfetto SDK
# newer than v9.0 -- the new SDK has this path built in.
env PERFETTO_PRODUCER_SOCK_NAME=/run/perfetto/traced-producer.sock
| |
pre-start script
  # If the pre-start script fails, by default nothing gets written to the
  # logs. To make failures diagnosable, stderr (including the `set -x` trace)
  # is redirected to a temp file whose contents are sent to syslog only when
  # the script exits with an error.
  tmp_log="$(mktemp)"
  exec 2>"${tmp_log}"
  dump_log() {
    # Must be the first statement so that $? is still the script's exit status.
    rc=$?
    if [ "${rc}" -ne 0 ]; then
      set +x
      # Split up lines before calling logger to avoid running into line limits.
      # Logging is best effort: a logger failure must not skip the cleanup.
      xargs -n 1 -d"\n" logger --tag "${UPSTART_JOB}" < "${tmp_log}" || :
    fi
    # Always remove the temp file; the job respawns, so leaving it behind
    # would leak one file per (re)start.
    rm -f "${tmp_log}"
  }
  trap dump_log EXIT
  set -x

  # Make /run/arcvm a shared mountpoint.
  mount --no-canonicalize --bind /run/arcvm /run/arcvm
  mount --no-canonicalize --make-shared /run/arcvm

  # Create persistent mount namespace at /run/namespaces/mnt_concierge.
  unshare --mount=/run/namespaces/mnt_concierge --propagation unchanged \
    -- /bin/true
  # Remount the bind-mount as 'noexec'.
  mount --no-canonicalize /run/namespaces/mnt_concierge -o remount,noexec

  # Prevent mounts at /run/arcvm from propagating out of the namespace.
  nsenter --mount=/run/namespaces/mnt_concierge --no-fork \
    -- bash -c "mount --no-canonicalize --make-slave /run/arcvm && \
                mount --no-canonicalize --make-shared /run/arcvm"

  nsenter --mount=/run/namespaces/mnt_concierge --no-fork \
    -- mount --no-canonicalize --bind \
      /mnt/stateful_partition/unencrypted/apkcache /run/arcvm/apkcache

  # The runtime directory for MyFiles, removable media, and virtual-files in
  # ARCVM. UID and GID correspond to Android's root and system, respectively.
  nsenter --mount=/run/namespaces/mnt_concierge --no-fork \
    -- mount --no-canonicalize -o \
      rw,nodev,noexec,nosuid,mode=0555,uid=655360,gid=656360 -t tmpfs \
      tmpfs /run/arcvm/media

  # Leave a marker that the main script stanza checks to enable the dev-mode
  # only bind mounts.
  if crossystem "cros_debug?1"; then
    touch /run/vm/dev_mode # croslint: disable
  fi
end script
| |
# We MUST NOT fork child processes other than the actual daemon (= minijail0)
# because `expect daemon` is set.
# So calling external commands (e.g. chown) or using `( ... )` subshells is
# disallowed in this script stanza; only shell builtins may be used. Otherwise
# upstart won't track the vm_concierge PID, so vm_concierge won't be killed
# when a user signs out.
script
  # Optional minijail0 arguments are accumulated in the positional parameters
  # and spliced into the final command line below as "$@".
  set --

  # Handle optional bind mounts.

  # Devices (bind-mounted writable when the character device exists).
  # TODO(crbug.com/892806): remove the /dev/udmabuf check when udmabuf is
  # built for all target kernels.
  for dev_node in /dev/mali0 /dev/pvr_sync /dev/udmabuf; do
    if [ -c "${dev_node}" ]; then
      set -- "$@" -b "${dev_node},,1"
    fi
  done

  # /run subpaths.
  for run_dir in /run/camera /run/cups_proxy /run/perfetto /run/pvm; do
    if [ -d "${run_dir}" ]; then
      set -- "$@" -b "${run_dir},,1"
    fi
  done

  # Bind mounts under /run/arcvm/ro, the read-only shared directory.
  # (go/arcvm-shared-dirs)
  if [ -d "/lib64" ]; then
    # There are 32-bit systems that don't run ARCVM; don't error out if those
    # directories don't exist.
    set -- "$@" -b /lib64,/run/arcvm/ro/lib64
    set -- "$@" -b /usr/lib64,/run/arcvm/ro/usr/lib64
  fi
  set -- "$@" -b /usr/share/fonts,/run/arcvm/ro/fonts
  # Each optional binary is exposed at the same path below /run/arcvm/ro.
  for ro_file in /usr/bin/vshd /sbin/ureadahead; do
    if [ -f "${ro_file}" ]; then
      set -- "$@" -b "${ro_file},/run/arcvm/ro${ro_file}"
    fi
  done

  # For vmm-swap feature.
  if [ -c "/dev/userfaultfd" ]; then
    set -- "$@" -b /dev/userfaultfd
  fi

  # Handle dev-mode only bind mounts. The marker file is created by the
  # pre-start script when cros_debug is set.
  if [ -f "/run/vm/dev_mode" ]; then
    if [ -d "/usr/local/lib64" ]; then
      set -- "$@" -b /usr/local/lib64,/run/arcvm/ro/usr/local/lib64
    fi
    # Debugging tools, exposed at the same path below /run/arcvm/ro.
    for dev_tool in /usr/local/bin/strace /usr/local/bin/stress-ng \
        /usr/local/bin/trace-cmd; do
      if [ -f "${dev_tool}" ]; then
        set -- "$@" -b "${dev_tool},/run/arcvm/ro${dev_tool}"
      fi
    done

    if [ -d "/opt/google/vms/android" ]; then
      set -- "$@" -b /opt/google/vms/android,,1
    fi
    if [ -d "/usr/local/vms" ] || [ -d "/usr/local/bin" ]; then
      # Put a tmpfs on /usr/local first, then bind only the directories that
      # exist on top of it.
      set -- "$@" -k 'local,/usr/local,tmpfs,MS_NOSUID|MS_NODEV|MS_NOEXEC'
      if [ -d "/usr/local/vms" ]; then
        set -- "$@" -b /usr/local/vms,,1
      fi
      for local_dir in /usr/local/bin /usr/local/lib /usr/local/lib64; do
        if [ -d "${local_dir}" ]; then
          set -- "$@" -b "${local_dir}"
        fi
      done
    fi
  fi

  # Allow write access to vfio by bind mounting /dev/vfio iff:
  # - NVIDIA dGPU device is detected and is bound to vfio module.
  #
  # Since this is running in the script stanza, we are not allowed to fork or
  # spawn new processes, so only builtins are used here. In particular the
  # condition is written with nested `if`s rather than `( ... )` groups,
  # because a subshell is a fork.
  if [ -c "/dev/vfio/vfio" ]; then
    # The glob keeps its trailing slash, so ${pci_dev} already ends in '/'.
    for pci_dev in /sys/bus/pci/devices/*/ ; do
      if [ ! -f "${pci_dev}device" ]; then
        continue
      fi
      read -r vendor < "${pci_dev}vendor"
      read -r class < "${pci_dev}class"

      # Check if NVIDIA dGPU device is detected.
      # vendor "0x10de" is the vendor id compared against for NVIDIA;
      # class "0x030200" = 3D Controller;
      # class "0x030000" = VGA device, which only counts when boot_vga is 0.
      needs_vfio=0
      if [ "${vendor}" = "0x10de" ]; then
        if [ "${class}" = "0x030200" ]; then
          needs_vfio=1
        elif [ "${class}" = "0x030000" ]; then
          read -r boot_vga < "${pci_dev}boot_vga"
          if [ "${boot_vga}" = "0" ]; then
            needs_vfio=1
          fi
        fi
      fi

      if [ "${needs_vfio}" = "1" ]; then
        set -- "$@" -b /dev/vfio,/dev/vfio,1
        # One bind mount is enough; don't add a duplicate per matching device.
        break
      fi
    done
  fi

  # Only mount shadercached daemon-store if it exists
  shadercached_dir=/run/daemon-store/shadercached
  if [ -d "${shadercached_dir}" ]; then
    set -- "$@" -k "${shadercached_dir},${shadercached_dir},none,MS_BIND|MS_REC"
  fi

  # Concierge needs to be able to configure user namespaces due to the
  # way ARC VM shares files with the host. This requires CAP_SETUID
  # and CAP_SETGID, but we can restrict the impact of this by running
  # concierge in an outer user namespace which only maps the uids
  # needed for the inner namespaces (except that concierge itself
  # needs access to a number of groups). We also need to map a number
  # of groups for concierge itself. The mapping does not renumber any
  # uids or gids to make the inner mappings easier to understand,
  # except that we are forced to map something to 0, and we use
  # crosvm-root for this purpose.
  #
  # uids:
  # - crosvm (299)
  # - shadercached (333)
  # - crosvm-root (20182)
  # - arc-camera (603, but we have to map 600-649 for unclear reasons)
  # - nobody (65534)
  # - Android (2,000,000 starting from 655360)
  #
  # gids:
  # - all of the above uids, plus
  #   (NOTE(review): this line used to say "except crosvm-dbus", but
  #   crosvm-dbus is not in the uid list above -- likely stale; confirm.)
  # - video (27)
  # - daemon-store (400)
  # - tun (413)
  # - virtaccess (418)
  # - cras (600)
  # - wayland (601)
  # - android-reserved-disk (20119)
  # - pluginvm (20128)
  # - cups-proxy (20136)
  # - traced-producer (20162)
  #
  # /proc is also remounted read-write because crosvm needs to be able to set the
  # uid_map and gid_map for its child processes and that needs a writable /proc.
  #
  # -Kslave is applied to propagate imageloader mounts into concierge's mount
  # namespace.

  # The maps are assembled piecewise so that no line continuation occurs inside
  # a quoted string; source indentation can therefore never end up in a value.
  uid_map="0 20182 1,299 299 1,333 333 1,600 600 50,65534 65534 1"
  uid_map="${uid_map},655360 655360 2000000"

  gid_map="0 20182 1,27 27 1,299 299 1,333 333 1,400 400 1,413 413 1,418 418 1"
  gid_map="${gid_map},600 600 50,20119 20119 1,20128 20128 1,20136 20136 1"
  gid_map="${gid_map},20162 20162 1,65534 65534 1,655360 655360 2000000"

  # Enter the persistent mount namespace created by pre-start, build the
  # sandbox with the outer minijail0 (fixed mounts first, then the optional
  # ones collected in "$@"), and let the inner minijail0 set up the user
  # namespace mappings and capabilities before running vm_concierge.
  exec nsenter --mount=/run/namespaces/mnt_concierge --no-fork \
    -- minijail0 -nlvd -i -t --uts \
       -Kslave \
       -P /mnt/empty \
       -b /,/ \
       -k 'proc,/proc,proc,MS_NOSUID|MS_NODEV|MS_NOEXEC' \
       -b /sys,/sys \
       -b /sys/devices,/sys/devices,1 \
       -b /sys/kernel/tracing,,1 \
       -k 'tmpfs,/sys/fs/cgroup,tmpfs,MS_NODEV|MS_NOEXEC|MS_NOSUID,mode=755,size=10M' \
       -b /sys/fs/cgroup/cpu,,1 \
       -b /sys/fs/cgroup/cpuset,,1 \
       -b /dev/log,/dev/log,1 \
       -b /dev/kvm,/dev/kvm,1 \
       -b /dev/mapper/vm,/dev/mapper/vm,1 \
       -b /dev/net,/dev/net,1 \
       -b /dev/vhost-vsock,/dev/vhost-vsock,1 \
       -b /dev/dri,/dev/dri,1 \
       -k 'run,/run,tmpfs,MS_NOSUID|MS_NODEV|MS_NOEXEC' \
       -b "${CAMERA_LIBFS_DIR}" \
       -b /run/chrome,/run/chrome,1 \
       -b /run/chromeos-config/v1/ \
       -b /run/cras/vms,/run/cras,1 \
       -b /run/dbus,/run/dbus,1 \
       -b /run/mojo,,1 \
       -b /run/vm,/run/vm,1 \
       -b /run/vm_cicerone/client,/run/vm_cicerone/client,1 \
       -k /run/imageloader,/run/imageloader,none,0x5000 \
       -k 'var,/var,tmpfs,MS_NOSUID|MS_NODEV|MS_NOEXEC' \
       -b /var/lib/metrics,,1 \
       -b /var/lib/timezone \
       -b /etc/os-release \
       -b /var/lib/vm_cicerone/metrics,/var/lib/vm_cicerone/metrics,1 \
       -b /var/lib/vm_concierge/vmm_swap_policy,,1 \
       -k '/run/daemon-store/crosvm,/run/daemon-store/crosvm,none,MS_BIND|MS_REC' \
       -k '/run/daemon-store/pvm,/run/daemon-store/pvm,none,MS_BIND|MS_REC' \
       -k '/run/arcvm,/run/arcvm,none,MS_BIND|MS_REC' \
       -b '/var/lib/metrics/structured,,1' \
       -b '/var/lib/metrics/structured/events,,1' \
       -R RLIMIT_MEMLOCK,unlimited,unlimited \
       "$@" \
       -- /sbin/minijail0 -U -pv -I -i \
          -Kslave \
          -c 'cap_setuid,cap_setgid,cap_ipc_lock+eip' --ambient \
          -k 'proc,/proc,proc,MS_NOSUID|MS_NODEV|MS_NOEXEC' \
          -m"${uid_map}" \
          -M"${gid_map}" \
          /usr/bin/vm_concierge
end script
| |
post-stop script
  # Undo the mounts set up by pre-start: first the persistent mount namespace
  # handle, then the /run/arcvm bind mount. Each is unmounted only if it is
  # currently a mountpoint.
  for mnt in /run/namespaces/mnt_concierge /run/arcvm; do
    if mountpoint -q "${mnt}"; then
      umount "${mnt}"
    fi
  done
end script