blob: b00db3c35494485b136c3ce98bd3089183deb915 [file] [log] [blame]
// Copyright 2018 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "cfm-device-monitor/apex-monitor/apex_monitor.h"
#include <base/bind.h>
#include <base/bind_helpers.h>
#include <base/files/file_path.h>
#include <base/files/file_util.h>
#include <base/files/scoped_file.h>
#include <base/strings/stringprintf.h>
#include <brillo/message_loops/message_loop.h>
#include <fcntl.h>
#include <string>
#include <utility>
namespace {

// Threshold that the power-cycle retry counter is compared against before
// the monitor gives up on the chip.
constexpr int kApexMaxRebootLimit = 5;

// Delays, in milliseconds, handed to PostTask() when scheduling the next step.
constexpr int kApexCheckIntervalMs = 60 * 1000;        // 60s.
constexpr int kResetSleepTimeMs = 5 * 1000;            // 5s.
constexpr int kPowerCycleFailSleepTimeMs = 10 * 1000;  // 10s.
constexpr int kPowerCycleSleepTimeMs = 5 * 1000;       // 5s.

// Overtemp warning dual-threshold: the warning is raised above the high
// value and only cleared again once the reading drops below the low value.
constexpr int kThermalWarningHighCelcius = 135;
constexpr int kThermalWarningLowCelcius = 115;

}  // namespace
namespace apex_monitor {
// Constructs a monitor that takes ownership of |apex_manager| and caches the
// chip ID reported by it.
//
// NOTE(review): chip_id_ is read through |apex_manager| while apex_manager_ is
// initialized by moving from the same pointer. Members are initialized in
// declaration order, not initializer-list order, so this is only safe if
// chip_id_ is declared before apex_manager_ in the header -- TODO confirm.
// NOTE(review): |apex_manager| is dereferenced unconditionally; callers are
// presumably required to pass a non-null pointer -- verify against callers.
ApexMonitor::ApexMonitor(std::unique_ptr<ApexManager> apex_manager)
: chip_id_(apex_manager->ChipID()),
apex_manager_(std::move(apex_manager)) {}
// Nothing to release by hand; members clean up after themselves.
ApexMonitor::~ApexMonitor() = default;
// Starts the monitoring cycle by scheduling the first chip status check
// (using PostTask's default delay).
void ApexMonitor::StartMonitor() {
  PostTask(&ApexMonitor::CheckChipStatus);
}
// Schedules |task| to run on the current brillo message loop after
// |delay_ms| milliseconds.
//
// |task| is expected to be a pointer to an ApexMonitor member function taking
// no arguments; it is bound to a weak pointer to this object rather than to
// |this| directly, so (per base::WeakPtr semantics) a task that fires after
// the monitor has been destroyed should not touch freed memory.
//
// |task| is a forwarding reference, so it is passed on with std::forward to
// preserve the value category the caller supplied.
template <typename Functor>
void ApexMonitor::PostTask(Functor&& task, const int& delay_ms) {
  brillo::MessageLoop::current()->PostDelayedTask(
      FROM_HERE,
      base::Bind(std::forward<Functor>(task), weak_factory_.GetWeakPtr()),
      base::TimeDelta::FromMilliseconds(delay_ms));
}
// Reads the chip status registers and schedules the next step:
//   - status read failed -> ResetChip
//   - power not good     -> PowerCycleChip
//   - boot failure       -> ResetChip
//   - otherwise          -> GetChipTemp
// Exactly one follow-up task is posted per call.
void ApexMonitor::CheckChipStatus() {
  VLOG(1) << "Checking chip " << chip_id_ << " status registers...";

  const bool status_read_ok =
      apex_manager_->CheckStatusRegister(&power_good_, &boot_fail_);

  if (!status_read_ok) {
    // Status could not be read at all; try a reset.
    LOG(WARNING) << "Failed to read chip status from IO expander";
    PostTask(&ApexMonitor::ResetChip);
  } else if (!power_good_) {
    // Power rail is reported bad; a power cycle is needed.
    // Please see b/109866345 before modifying the below line.
    LOG(WARNING) << "Detect Apex " << chip_id_
                 << " power failure. Power-cycling chip.";
    PostTask(&ApexMonitor::PowerCycleChip);
  } else if (boot_fail_) {
    // Chip is powered but reports a boot failure; try a reset.
    // Please see b/109866345 before modifying the below line.
    LOG(WARNING) << "Detect Apex " << chip_id_
                 << " boot failure. Resetting chip.";
    PostTask(&ApexMonitor::ResetChip);
  } else {
    // Status registers look fine; move on to the temperature check.
    VLOG(1) << "Chip " << chip_id_ << " status good.";
    PostTask(&ApexMonitor::GetChipTemp);
  }
}
// Reads the chip temperature, updates the over-temperature warning state and
// schedules the next status check. If the temperature cannot be read, a chip
// reset is scheduled instead.
void ApexMonitor::GetChipTemp() {
  VLOG(1) << "Reading chip " << chip_id_ << " temperature...";

  const bool temp_read_ok = apex_manager_->ReadChipTemp(&chip_temp_);
  if (!temp_read_ok) {
    // Temperature read failed; fall back to resetting the chip.
    PostTask(&ApexMonitor::ResetChip);
    return;
  }

  // Two-threshold hysteresis: warn once when crossing the high threshold and
  // clear the warning only after dropping below the low threshold.
  if (!over_temp_ && chip_temp_ > kThermalWarningHighCelcius) {
    over_temp_ = true;
    // Please see b/109866345 before modifying the below line.
    LOG(WARNING) << "Apex temperature is too high (> 135C).";
  } else if (over_temp_ && chip_temp_ < kThermalWarningLowCelcius) {
    over_temp_ = false;
    LOG(WARNING) << "Apex temperature back to normal.";
  }

  // Come back for another status check after the regular interval.
  PostTask(&ApexMonitor::CheckChipStatus, kApexCheckIntervalMs);
}
// Returns the currently stored chip temperature value.
int ApexMonitor::ChipTemp() {
  return chip_temp_;
}
// Returns the currently stored boot-failure flag.
bool ApexMonitor::BootFail() {
  return boot_fail_;
}
// Returns the currently stored power-good flag.
bool ApexMonitor::PowerGood() {
  return power_good_;
}
// Resets the chip and re-reads its status registers. When the reset succeeds,
// the status can be read and no boot failure is reported, a status check is
// scheduled after kResetSleepTimeMs; otherwise a power cycle is scheduled.
void ApexMonitor::ResetChip() {
  VLOG(1) << "Reseting chip " << apex_manager_->ChipID();

  // Short-circuit evaluation matters: the status registers are only read if
  // the reset itself succeeded, and boot_fail_ is only trusted if that read
  // succeeded.
  const bool recovered =
      apex_manager_->ResetChip() &&
      apex_manager_->CheckStatusRegister(&power_good_, &boot_fail_) &&
      !boot_fail_;

  if (!recovered) {
    // Please see b/109866345 before modifying the below line.
    LOG(WARNING) << "Reset failed or didn't recover chip, will power-cycle.";
    PostTask(&ApexMonitor::PowerCycleChip);
    return;
  }

  PostTask(&ApexMonitor::CheckChipStatus, kResetSleepTimeMs);
}
// Power-cycles the chip and verifies that it came back healthy.
//
// On success (power cycle succeeded, status registers readable, power good
// and no boot failure) the retry counter is cleared and a status check is
// scheduled after kPowerCycleSleepTimeMs. On failure the retry counter is
// incremented and another power cycle is scheduled after
// kPowerCycleFailSleepTimeMs. Once the counter exceeds kApexMaxRebootLimit,
// nothing further is scheduled, which ends monitoring for this chip.
void ApexMonitor::PowerCycleChip() {
  if (power_cycle_cnt_ > kApexMaxRebootLimit) {
    // Please see b/109866345 before modifying the below line.
    LOG(ERROR) << "Exceed max retry limit. Apex chip " << chip_id_
               << " is probably down. Stop monitor thread";
    return;
  }

  VLOG(1) << "Power-cycling chip " << chip_id_;

  // power_good_ must be checked here as well as boot_fail_. Otherwise a chip
  // whose power stays bad would be treated as recovered, the retry counter
  // would be cleared, and the next CheckChipStatus() would send it straight
  // back here -- looping forever without ever reaching the retry limit.
  const bool recovered =
      apex_manager_->PowerCycleChip() &&
      apex_manager_->CheckStatusRegister(&power_good_, &boot_fail_) &&
      power_good_ && !boot_fail_;

  if (recovered) {
    // power_cycle_cnt_ only counts failed attempts, so add the one that just
    // succeeded when reporting how many power cycles it took.
    VLOG(1) << "Chip recovered after " << (power_cycle_cnt_ + 1)
            << " power-cycle(s)";
    // Device status registers look good.
    power_cycle_cnt_ = 0;
    PostTask(&ApexMonitor::CheckChipStatus, kPowerCycleSleepTimeMs);
    return;
  }

  // Only log at first retry.
  if (power_cycle_cnt_ == 0) {
    LOG(WARNING) << "Power cycle failed or didn't recover chip, will retry.";
  }
  power_cycle_cnt_++;
  PostTask(&ApexMonitor::PowerCycleChip, kPowerCycleFailSleepTimeMs);
}
} // namespace apex_monitor