// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//! vmwdt is a virtual, memory-mapped watchdog device which detects stalls
//! on the vCPUs and resets the guest when no 'pet' events are received.
use std::collections::BTreeMap;
use std::convert::TryFrom;
use std::fs;
use std::sync::Arc;
use std::time::Duration;
use anyhow::Context;
use base::custom_serde::serialize_arc_mutex;
use base::debug;
use base::error;
use base::warn;
use base::AsRawDescriptor;
use base::Descriptor;
use base::Error as SysError;
use base::Event;
use base::EventToken;
use base::SendTube;
use base::Timer;
use base::TimerTrait;
use base::Tube;
use base::VmEventType;
use base::WaitContext;
use base::WorkerThread;
use serde::Deserialize;
use serde::Serialize;
use sync::Mutex;
use vm_control::VmResponse;
use crate::pci::CrosvmDeviceId;
use crate::BusAccessInfo;
use crate::BusDevice;
use crate::DeviceId;
use crate::IrqEdgeEvent;
use crate::Suspendable;
// Register offsets, relative to the start of one vCPU's register window.
// Enable/disable register (presumably: a non-zero write enables the watchdog).
const VMWDT_REG_STATUS: u32 = 0x00;
// Load-count register (presumably: the number of clock periods until expiration).
const VMWDT_REG_LOAD_CNT: u32 = 0x04;
// Current-count register; writes to it are rejected with a warning.
const VMWDT_REG_CURRENT_CNT: u32 = 0x08;
// Frequency, in Hz, of the per-vCPU watchdog clock.
const VMWDT_REG_CLOCK_FREQ_HZ: u32 = 0x0C;
// Size in bytes of one vCPU's register window: the MMIO offset divided by this value
// selects the vCPU, and the remainder selects the register.
const VMWDT_REG_LEN: u64 = 0x10;
// Exported defaults; they are not referenced by the device code in this file.
pub const VMWDT_DEFAULT_TIMEOUT_SEC: u32 = 10;
pub const VMWDT_DEFAULT_CLOCK_HZ: u32 = 2;
// Zero-based index of the `guest_time` field (field 43 in proc(5)) in the
// whitespace-separated contents of `/proc/<pid>/task/<tid>/stat`.
const PROCSTAT_GUEST_TIME_INDX: usize = 42;
/// State of the watchdog belonging to a single vCPU.
pub struct VmwdtPerCpu {
// Flag which indicates if the watchdog is started
is_enabled: bool,
// Timer used to generate periodic events at `timer_freq_hz` frequency
timer: Timer,
// The frequency of the `timer`, in Hz, as written by the guest
timer_freq_hz: u64,
// Timestamp measured in milliseconds of the last guest activity
last_guest_time_ms: i64,
// The thread_id of the thread this vcpu belongs to
thread_id: u32,
// The process id of the task this vcpu belongs to
process_id: u32,
// The pre-programmed one-shot expiration interval. If the guest runs in this
// interval but we don't receive a periodic event, the guest is stalled.
next_expiration_interval_ms: i64,
// Keeps track of whether the watchdog PPI has already been raised.
stall_evt_ppi_triggered: bool,
// Keeps track of whether the timer was armed in one-shot mode (`None`) or with a
// repeating interval (`Some(interval)`)
repeating_interval: Option<Duration>,
/// Per-vCPU fields read back from a snapshot; `restore` copies each of them into the
/// matching `VmwdtPerCpu` entry.
struct VmwdtPerCpuRestore {
is_enabled: bool,
timer_freq_hz: u64,
last_guest_time_ms: i64,
next_expiration_interval_ms: i64,
repeating_interval: Option<Duration>,
/// The watchdog device: one `VmwdtPerCpu` per vCPU plus the worker thread that services
/// their timers.
pub struct Vmwdt {
// Per-vCPU watchdog state, shared between the device and the worker thread
vm_wdts: Arc<Mutex<Vec<VmwdtPerCpu>>>,
// The worker thread that waits on the timer fd
worker_thread: Option<WorkerThread<Tube>>,
// TODO: @sebastianene add separate reset event for the watchdog
// Reset source if the device is not responding
reset_evt_wrtube: SendTube,
// Set once the worker has been started; `wake` only restarts the worker when this is true
activated: bool,
// Event to be used to interrupt the guest on detected stalls
stall_evt: IrqEdgeEvent,
// Control tube handed to the worker while it runs and stored back here by `sleep`
vm_ctrl_tube: Option<Tube>,
/// Serialized form of the device produced by `snapshot`.
struct VmwdtSnapshot {
// The shared per-vCPU state is serialized through the `serialize_arc_mutex` helper.
#[serde(serialize_with = "serialize_arc_mutex")]
vm_wdts: Arc<Mutex<Vec<VmwdtPerCpu>>>,
activated: bool,
/// Deserialized form of a snapshot consumed by `restore`.
struct VmwdtRestore {
vm_wdts: Vec<VmwdtPerCpuRestore>,
activated: bool,
impl Vmwdt {
/// Creates a watchdog device with `cpu_count` disabled per-vCPU watchdogs.
///
/// The worker thread is not started here (`worker_thread` is `None` and `activated` is
/// `false`).
///
/// # Errors
///
/// Returns an error if a `Timer` cannot be created for one of the vCPUs.
pub fn new(
cpu_count: usize,
reset_evt_wrtube: SendTube,
evt: IrqEdgeEvent,
vm_ctrl_tube: Tube,
) -> anyhow::Result<Vmwdt> {
let mut vec = Vec::new();
// One zero-initialised, disabled watchdog (with its own timer) per vCPU.
for _ in 0..cpu_count {
vec.push(VmwdtPerCpu {
last_guest_time_ms: 0,
thread_id: 0,
process_id: 0,
is_enabled: false,
stall_evt_ppi_triggered: false,
timer: Timer::new().context("failed to create Timer")?,
timer_freq_hz: 0,
next_expiration_interval_ms: 0,
repeating_interval: None,
let vm_wdts = Arc::new(Mutex::new(vec));
Ok(Vmwdt {
worker_thread: None,
activated: false,
stall_evt: evt,
// Kept in an `Option` so that `start` can reject a missing tube.
vm_ctrl_tube: Some(vm_ctrl_tube),
/// Body of the watchdog worker thread.
///
/// Fetches the vCPU PID/TID map over `vm_ctrl_tube`, then waits on `kill_evt` and on
/// every per-vCPU timer. On a timer event it compares the guest time consumed since the
/// last sample against the programmed expiration interval and either re-arms the timer
/// for the remainder or handles the expiration.
///
/// Returns `vm_ctrl_tube` when `kill_evt` is signalled, so the caller can reuse it.
///
/// # Errors
///
/// Returns an error if the PID/TID exchange fails, if the wait context cannot be set up
/// or waited on, if reading the guest time fails, or if the stall event cannot be
/// triggered.
pub fn vmwdt_worker_thread(
vm_wdts: Arc<Mutex<Vec<VmwdtPerCpu>>>,
kill_evt: Event,
reset_evt_wrtube: SendTube,
stall_evt: IrqEdgeEvent,
vm_ctrl_tube: Tube,
worker_started_send: Option<SendTube>,
) -> anyhow::Result<Tube> {
// Request the PID and TID of every vCPU thread from the VM control side.
let msg = vm_control::VmRequest::VcpuPidTid;
.context("failed to send request to fetch Vcpus PID and TID")?;
let vcpus_pid_tid: BTreeMap<usize, (u32, u32)> = match vm_ctrl_tube
.context("failed to receive vmwdt pids and tids")?
VmResponse::VcpuPidTidResponse { pid_tid_map } => pid_tid_map,
// Any other response type is a protocol error.
_ => {
return Err(anyhow::anyhow!(
"Receive incorrect message type when trying to get vcpu pid tid map"
let mut vm_wdts = vm_wdts.lock();
// Record the (pid, tid) pair of each watchdog's vCPU.
for (i, vmwdt) in (*vm_wdts).iter_mut().enumerate() {
let pid_tid = vcpus_pid_tid
.context("vmwdts empty, which could indicate no vcpus are initialized")?;
vmwdt.process_id = pid_tid.0;
vmwdt.thread_id = pid_tid.1;
// Tell the starter, if it supplied a tube, that the PID/TID setup is done.
if let Some(worker_started_send) = worker_started_send {
.context("failed to send vmwdt worker started")?;
enum Token {
let wait_ctx: WaitContext<Token> =
WaitContext::new().context("Failed to create wait_ctx")?;
.add(&kill_evt, Token::Kill)
.context("Failed to add Tokens to wait_ctx")?;
// Register every per-vCPU timer, tagged with the index of its vCPU.
let len = vm_wdts.lock().len();
for clock_id in 0..len {
let timer_fd = vm_wdts.lock()[clock_id].timer.as_raw_descriptor();
.add(&Descriptor(timer_fd), Token::Timer(clock_id))
.context("Failed to link FDs to Tokens")?;
loop {
let events = wait_ctx.wait().context("Failed to wait for events")?;
for event in events.iter().filter(|e| e.is_readable) {
match event.token {
Token::Kill => {
// Hand the control tube back to whoever stopped the worker.
return Ok(vm_ctrl_tube);
Token::Timer(cpu_id) => {
let mut wdts_locked = vm_wdts.lock();
let watchdog = &mut wdts_locked[cpu_id];
if let Err(_e) = watchdog.timer.wait() {
error!("error waiting for timer event on vcpu {}", cpu_id);
let current_guest_time_ms =
Vmwdt::get_guest_time_ms(watchdog.process_id, watchdog.thread_id)
.context("get_guest_time_ms failed")?;
// Time left in the interval = programmed interval minus the guest time
// consumed since the last sample.
let remaining_time_ms = watchdog.next_expiration_interval_ms
- (current_guest_time_ms - watchdog.last_guest_time_ms);
if remaining_time_ms > 0 {
// The guest has not used up the whole interval yet: re-arm a one-shot
// timer for the remainder.
watchdog.next_expiration_interval_ms = remaining_time_ms;
if let Err(e) = watchdog
.reset_oneshot(Duration::from_millis(remaining_time_ms as u64))
"failed to reset internal timer on vcpu {}: {:#}",
cpu_id, e
watchdog.repeating_interval = None;
} else {
// The interval is used up. If the stall event was already raised once,
// the reset tube is used as well.
if watchdog.stall_evt_ppi_triggered {
if let Err(e) = reset_evt_wrtube
error!("{} failed to send reset event from vcpu {}", e, cpu_id)
.context("Failed to trigger stall event")?;
watchdog.stall_evt_ppi_triggered = true;
watchdog.last_guest_time_ms = current_guest_time_ms;
/// Starts the "vmwdt worker" thread and marks the device as activated.
///
/// `worker_started_send` is an optional tube for the worker-started notification.
///
/// # Errors
///
/// Returns an error with context "missing vm control tube" when no control tube is
/// available.
///
/// # Panics
///
/// Panics if cloning the reset tube or the stall event fails.
fn start(&mut self, worker_started_send: Option<SendTube>) -> anyhow::Result<()> {
let vm_wdts = self.vm_wdts.clone();
let reset_evt_wrtube = self.reset_evt_wrtube.try_clone().unwrap();
let stall_event = self.stall_evt.try_clone().unwrap();
let vm_ctrl_tube = self
.context("missing vm control tube")?;
self.activated = true;
self.worker_thread = Some(WorkerThread::start("vmwdt worker", |kill_evt| {
.expect("failed to start vmwdt worker thread")
/// Starts the worker thread on first use, checking first whether `worker_thread` is
/// already set.
///
/// A directional tube pair is created for the worker-started notification.
///
/// # Panics
///
/// Panics if the tube pair cannot be created, if the worker cannot be started, or if the
/// started notification cannot be received.
fn ensure_started(&mut self) {
if self.worker_thread.is_some() {
let (worker_started_send, worker_started_recv) =
Tube::directional_pair().expect("failed to create vmwdt worker started tubes");
.expect("failed to start Vmwdt");
.expect("failed to receive vmwdt worker started");
/// Returns the guest time, in milliseconds, of the thread `thread_id` of process
/// `process_id`.
///
/// The value is read from `/proc/<process_id>/task/<thread_id>/stat` in clock ticks and
/// converted with `sysconf(_SC_CLK_TCK)`.
///
/// # Errors
///
/// Returns an error if the stat file cannot be read.
#[cfg(any(target_os = "linux", target_os = "android"))]
pub fn get_guest_time_ms(process_id: u32, thread_id: u32) -> Result<i64, SysError> {
// TODO: @sebastianene check if we can avoid open-read-close on each call
let stat_path = format!("/proc/{}/task/{}/stat", process_id, thread_id);
let contents = fs::read_to_string(stat_path)?;
let gtime_ticks = contents
.and_then(|guest_time| guest_time.parse::<u64>().ok())
// SAFETY: `sysconf` only returns an integer.
// NOTE(review): `sysconf` reports failure as -1, which this cast would turn into a
// huge divisor; confirm `_SC_CLK_TCK` cannot fail on supported targets.
let ticks_per_sec = unsafe { libc::sysconf(libc::_SC_CLK_TCK) } as u64;
// ticks -> milliseconds
Ok((gtime_ticks * 1000 / ticks_per_sec) as i64)
/// Variant of `get_guest_time_ms` compiled on targets other than Linux and Android; it
/// has the same signature as the Linux/Android variant.
#[cfg(not(any(target_os = "linux", target_os = "android")))]
pub fn get_guest_time_ms(process_id: u32, thread_id: u32) -> Result<i64, SysError> {
impl BusDevice for Vmwdt {
/// Returns this device's debug label as a `String`.
fn debug_label(&self) -> String {
/// Returns the `DeviceId` of this device.
fn device_id(&self) -> DeviceId {
/// Reads are ignored: the body is empty, so `_data` is left untouched.
fn read(&mut self, _offset: BusAccessInfo, _data: &mut [u8]) {}
/// Handles a guest write to the watchdog register space.
///
/// `info.offset / VMWDT_REG_LEN` selects the vCPU and `info.offset % VMWDT_REG_LEN`
/// selects the register. `data` must be exactly 4 bytes and is decoded in native byte
/// order.
///
/// The following writes are logged and ignored instead of being applied:
/// * writes that are not 4 bytes wide,
/// * writes whose vCPU index is not a valid index into `vm_wdts`,
/// * writes that would arm the watchdog while the clock frequency is still zero.
///
/// # Panics
///
/// Panics if the timer cannot be re-armed or cleared, or if the guest time cannot be
/// read.
fn write(&mut self, info: BusAccessInfo, data: &[u8]) {
let data_array = match <&[u8; 4]>::try_from(data) {
Ok(array) => array,
_ => {
error!("Bad write size: {} for vmwdt", data.len());
let reg_val = u32::from_ne_bytes(*data_array);
let cpu_index: usize = (info.offset / VMWDT_REG_LEN) as usize;
let reg_offset = (info.offset % VMWDT_REG_LEN) as u32;
// Valid indices are `0..len`, so an index equal to `len` must be rejected as well;
// otherwise `wdts_locked[cpu_index]` below panics.
if cpu_index >= self.vm_wdts.lock().len() {
error!("Bad write cpu_index {}", cpu_index);
match reg_offset {
let mut wdts_locked = self.vm_wdts.lock();
let cpu_watchdog = &mut wdts_locked[cpu_index];
// `timer_freq_hz` is zero until the guest programs it. Refuse to enable the
// watchdog in that state rather than dividing by zero below.
if reg_val != 0 && cpu_watchdog.timer_freq_hz == 0 {
error!("vmwdt enabled on vcpu {} before the clock frequency was set", cpu_index);
return;
}
cpu_watchdog.is_enabled = reg_val != 0;
if reg_val != 0 {
// One timer period, in milliseconds, at the programmed frequency.
let interval = Duration::from_millis(1000 / cpu_watchdog.timer_freq_hz);
cpu_watchdog.repeating_interval = Some(interval);
.expect("Failed to reset timer repeating interval");
} else {
cpu_watchdog.repeating_interval = None;
.expect("Failed to clear cpu watchdog timer");
// Copy the ids out in a short scope so the lock is not held while the guest time
// is read.
let (process_id, thread_id) = {
let mut wdts_locked = self.vm_wdts.lock();
let cpu_watchdog = &mut wdts_locked[cpu_index];
(cpu_watchdog.process_id, cpu_watchdog.thread_id)
let guest_time_ms = Vmwdt::get_guest_time_ms(process_id, thread_id)
.expect("get_guest_time_ms failed");
let mut wdts_locked = self.vm_wdts.lock();
let cpu_watchdog = &mut wdts_locked[cpu_index];
// The count cannot be converted to milliseconds without a clock frequency; ignore
// the write rather than dividing by zero.
if cpu_watchdog.timer_freq_hz == 0 {
error!("vmwdt loaded on vcpu {} before the clock frequency was set", cpu_index);
return;
}
// Loaded count (in clock periods) converted to milliseconds.
let next_expiration_interval_ms =
reg_val as u64 * 1000 / cpu_watchdog.timer_freq_hz;
cpu_watchdog.last_guest_time_ms = guest_time_ms;
cpu_watchdog.stall_evt_ppi_triggered = false;
cpu_watchdog.next_expiration_interval_ms = next_expiration_interval_ms as i64;
if cpu_watchdog.is_enabled {
if let Err(_e) = cpu_watchdog
error!("failed to reset one-shot vcpu time {}", cpu_index);
cpu_watchdog.repeating_interval = None;
warn!("invalid write to read-only VMWDT_REG_CURRENT_CNT register");
let mut wdts_locked = self.vm_wdts.lock();
let cpu_watchdog = &mut wdts_locked[cpu_index];
"CPU:{:x} wrote VMWDT_REG_CLOCK_FREQ_HZ {:x}",
cpu_index, reg_val
cpu_watchdog.timer_freq_hz = reg_val as u64;
_ => unreachable!(),
impl Suspendable for Vmwdt {
/// Stops the worker thread, if one is running, and stores the control tube it returns
/// back into `vm_ctrl_tube`.
fn sleep(&mut self) -> anyhow::Result<()> {
if let Some(worker) = self.worker_thread.take() {
self.vm_ctrl_tube = Some(worker.stop());
/// Resumes the device after `sleep`; nothing is done unless the device was activated.
///
/// Each watchdog's timer is re-armed from its saved state: with the repeating interval
/// when one is recorded, otherwise in one-shot mode when the watchdog is enabled.
///
/// # Errors
///
/// Returns an error if re-arming a timer fails.
fn wake(&mut self) -> anyhow::Result<()> {
if self.activated {
// We do not pass a tube to notify that the worker thread has started on wake.
// At this stage, vm_control is blocked on resuming devices and cannot provide the vcpu
// PIDs/TIDs yet.
// At the same time, the Vcpus are still frozen, which means no MMIO will get
// processed, and write will not get triggered.
// The request to get PIDs/TIDs should get processed before any MMIO request occurs.
let mut vm_wdts = self.vm_wdts.lock();
for vmwdt in vm_wdts.iter_mut() {
// A recorded repeating interval takes precedence over the one-shot state.
if let Some(interval) = &vmwdt.repeating_interval {
.context("failed to write repeating interval")?;
} else if vmwdt.is_enabled {
vmwdt.next_expiration_interval_ms as u64,
.context("failed to write oneshot interval")?;
/// Serializes the per-vCPU watchdog state and the `activated` flag to a JSON value.
///
/// # Errors
///
/// Returns an error if serialization fails.
fn snapshot(&mut self) -> anyhow::Result<serde_json::Value> {
serde_json::to_value(&VmwdtSnapshot {
vm_wdts: self.vm_wdts.clone(),
activated: self.activated,
.context("failed to snapshot Vmwdt")
/// Restores the device state from a value produced by `snapshot`.
///
/// Restored entries are paired with the existing watchdogs by position using `zip`, so
/// entries beyond the shorter of the two lists are ignored. Only the fields of
/// `VmwdtPerCpuRestore` are overwritten.
///
/// # Errors
///
/// Returns an error if `data` cannot be deserialized.
fn restore(&mut self, data: serde_json::Value) -> anyhow::Result<()> {
let deser: VmwdtRestore =
serde_json::from_value(data).context("failed to deserialize Vmwdt")?;
let mut vm_wdts = self.vm_wdts.lock();
for (vmwdt_restore, vmwdt) in deser.vm_wdts.iter().zip(vm_wdts.iter_mut()) {
vmwdt.is_enabled = vmwdt_restore.is_enabled;
vmwdt.timer_freq_hz = vmwdt_restore.timer_freq_hz;
vmwdt.last_guest_time_ms = vmwdt_restore.last_guest_time_ms;
vmwdt.next_expiration_interval_ms = vmwdt_restore.next_expiration_interval_ms;
vmwdt.repeating_interval = vmwdt_restore.repeating_interval;
self.activated = deser.activated;
mod tests {
use std::process;
use std::thread::sleep;
#[cfg(any(target_os = "linux", target_os = "android"))]
use base::gettid;
use base::poll_assert;
use base::Tube;
use super::*;
// Bus address used for the device in these tests.
const AARCH64_VMWDT_ADDR: u64 = 0x3000;
// Number of vCPUs (and therefore per-vCPU watchdogs) the test devices are created with.
const TEST_VMWDT_CPU_NO: usize = 0x1;
// Builds the `BusAccessInfo` for an access to the test device; `offset` is the register
// offset being accessed.
fn vmwdt_bus_address(offset: u64) -> BusAccessInfo {
BusAccessInfo {
address: AARCH64_VMWDT_ADDR,
id: 0,
// Checks that the worker thread services the internal timer: after the watchdog is
// programmed and enabled, `next_expiration_interval_ms` must eventually change.
fn test_watchdog_internal_timer() {
let (vm_evt_wrtube, _vm_evt_rdtube) = Tube::directional_pair().unwrap();
let (vm_ctrl_wrtube, vm_ctrl_rdtube) = Tube::pair().unwrap();
let irq = IrqEdgeEvent::new().unwrap();
// Queue the PID/TID response for vCPU 0, using this test's own process and thread ids.
#[cfg(any(target_os = "linux", target_os = "android"))]
.send(&VmResponse::VcpuPidTidResponse {
pid_tid_map: BTreeMap::from([(0, (process::id(), gettid() as u32))]),
let mut device = Vmwdt::new(TEST_VMWDT_CPU_NO, vm_evt_wrtube, irq, vm_ctrl_rdtube).unwrap();
// Configure the watchdog device: the clock frequency register is written with the
// bytes [10, 0, 0, 0], i.e. a 10Hz internal clock on little-endian hosts
vmwdt_bus_address(VMWDT_REG_CLOCK_FREQ_HZ as u64),
&[10, 0, 0, 0],
// Load a count of 1, then enable the watchdog.
device.write(vmwdt_bus_address(VMWDT_REG_LOAD_CNT as u64), &[1, 0, 0, 0]);
device.write(vmwdt_bus_address(VMWDT_REG_STATUS as u64), &[1, 0, 0, 0]);
let next_expiration_ms = {
let mut vmwdt_locked = device.vm_wdts.lock();
// In the test scenario the guest does not interpret the /proc/stat::guest_time, thus
// the function get_guest_time() returns 0
vmwdt_locked[0].last_guest_time_ms = 10;
// Poll multiple times as we don't get a signal when the watchdog thread has run.
poll_assert!(10, || {
let vmwdt_locked = device.vm_wdts.lock();
// Verify that our timer expired and the next_expiration_interval_ms changed
vmwdt_locked[0].next_expiration_interval_ms != next_expiration_ms
// Checks the expiration path: with the last guest time forced into the past, a
// `VmEventType::WatchdogReset` event must eventually arrive on the VM event tube.
fn test_watchdog_expiration() {
let (vm_evt_wrtube, vm_evt_rdtube) = Tube::directional_pair().unwrap();
let (vm_ctrl_wrtube, vm_ctrl_rdtube) = Tube::pair().unwrap();
let irq = IrqEdgeEvent::new().unwrap();
// Queue the PID/TID response for vCPU 0, using this test's own process and thread ids.
#[cfg(any(target_os = "linux", target_os = "android"))]
.send(&VmResponse::VcpuPidTidResponse {
pid_tid_map: BTreeMap::from([(0, (process::id(), gettid() as u32))]),
let mut device = Vmwdt::new(TEST_VMWDT_CPU_NO, vm_evt_wrtube, irq, vm_ctrl_rdtube).unwrap();
// Configure the watchdog device: the clock frequency register is written with the
// bytes [10, 0, 0, 0], i.e. a 10Hz internal clock on little-endian hosts
vmwdt_bus_address(VMWDT_REG_CLOCK_FREQ_HZ as u64),
&[10, 0, 0, 0],
// Load a count of 1, then enable the watchdog.
device.write(vmwdt_bus_address(VMWDT_REG_LOAD_CNT as u64), &[1, 0, 0, 0]);
device.write(vmwdt_bus_address(VMWDT_REG_STATUS as u64), &[1, 0, 0, 0]);
// In the test scenario the guest does not interpret the /proc/stat::guest_time, thus
// the function get_guest_time() returns 0
device.vm_wdts.lock()[0].last_guest_time_ms = -100;
// Check that the interrupt has been raised
poll_assert!(10, || {
let vmwdt_locked = device.vm_wdts.lock();
// Simulate that the time has passed since the last expiration
device.vm_wdts.lock()[0].last_guest_time_ms = -100;
// Poll multiple times as we don't get a signal when the watchdog thread has run.
poll_assert!(10, || {
match vm_evt_rdtube.recv::<VmEventType>() {
Ok(vm_event) => vm_event == VmEventType::WatchdogReset,
Err(_e) => false,