blob: 4d6db01a8847b3f41adaa980ccc8957036d0f542 [file] [log] [blame]
// Copyright 2024 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
package main
import (
"context"
"fmt"
"log"
"strings"
"time"
"go.chromium.org/chromiumos/config/go/test/api"
)
const (
// noPerfStatus is the substring checkStatus looks for in the error of a
// failed ps call to conclude that no perf process was found and that perf
// has to be restarted.
// This is possible when the DUT has rebooted.
noPerfStatus = "status:1"
// noConnStatus is the substring checkStatus looks for to recognize error
// status 42, which means the DUT connection was not alive.
// That is possible when:
// 1. DUT is in sleep, suspended or rebooting.
// 2. transient lab network issues.
// These are probably intermittent states and not a cause for Error.
noConnStatus = "status:42"
// needRestart is the error text checkStatus returns, and start matches on,
// to signal that perf must be started again.
needRestart = "perf need restart"
)
// perfStarter is the dependency monitor uses to launch perf.
type perfStarter interface {
// startPerf starts perf and returns a string that monitor later passes to
// ps as the pid to watch, or an error if perf could not be started.
startPerf(ctx context.Context) (string, error)
}
// monitor periodically checks that perf is still running and restarts it
// through perf when the health check reports that the process is gone.
type monitor struct {
// exec runs the ps command used by the health check.
exec execUtil
// perf is used to (re)start perf.
perf perfStarter
// dut is the DUT service client handed to exec when running commands.
dut api.DutServiceClient
// healthCheckInterval is how long the monitor waits between health checks.
healthCheckInterval time.Duration
// stopMonitoring chan is used to notify monitor to stop monitoring.
stopMonitoring chan bool
// terminated chan is used by the monitor to notify that it has terminated;
// start sends true on it on every exit path, including failures.
terminated chan bool
}
// startPerf launches perf through m.perf and returns the identifier it
// reports, which the monitor later passes to ps as the pid to watch.
//
// On failure the error is logged here, because start only reacts to a failed
// launch and does not log it itself. The returned error wraps the underlying
// cause with %w so callers can still inspect it with errors.Is / errors.As;
// the message text is the same as it would be with %v.
func (m *monitor) startPerf(ctx context.Context) (string, error) {
	pid, err := m.perf.startPerf(ctx)
	if err != nil {
		err = fmt.Errorf("error while starting perf: %w", err)
		log.Println(err)
		return "", err
	}
	return pid, nil
}
// checkStatus runs the ps command (with "-p <pid> -o comm=") through m.exec
// against m.dut to find out whether the process identified by pid still exists.
//
// It returns:
//   - nil when the command succeeds (perf is considered healthy);
//   - an error whose text is needRestart when the command error contains
//     noPerfStatus, meaning no such process was found and perf must be
//     restarted;
//   - nil when the command error contains noConnStatus, because a dead DUT
//     connection is treated as a transient condition and only logged;
//   - an error wrapping the command error, together with the command output,
//     for any other failure.
//
// NOTE(review): the status checks are substring matches, so an error text
// containing e.g. "status:12" would also match noPerfStatus — confirm the
// exact error format produced by exec.RunCmd.
func (m *monitor) checkStatus(ctx context.Context, pid string) error {
	psResult, err := m.exec.RunCmd(ctx, ps, []string{"-p", pid, "-o", "comm="}, m.dut)
	if err == nil {
		log.Println("perf is healthy.")
		return nil
	}

	switch msg := err.Error(); {
	case strings.Contains(msg, noPerfStatus):
		log.Println("No running perf found on the DUT. Will need to restart it.")
		return fmt.Errorf(needRestart)
	case strings.Contains(msg, noConnStatus):
		log.Println("Warning received status 42 while during health check.")
		return nil
	default:
		// Arguments follow the order of the verbs: the error first, then the
		// command output. %w keeps the original error reachable for callers.
		return fmt.Errorf("Error:ps command failed with unexpected error: %w, command result %v", err, psResult)
	}
}
// start runs the health-check loop for the perf process identified by pid.
//
// If pid is empty, perf is started first. Afterwards, every
// healthCheckInterval the process is checked; when the check reports that
// perf needs a restart, perf is started again and the new pid is monitored.
//
// The loop ends when a value arrives on stopMonitoring, when perf cannot be
// (re)started, or when the health check fails with an unexpected error. On
// every exit path true is sent on terminated before returning.
func (m *monitor) start(pid string) {
	log.Println("Starting to monitoring.")
	ctx := context.Background()

	if pid == "" {
		started, err := m.startPerf(ctx)
		if err != nil {
			// startPerf has already logged the failure.
			m.terminated <- true
			return
		}
		pid = started
	}

	for {
		// Wait for either a stop request or the next health-check tick.
		select {
		case <-m.stopMonitoring:
			log.Println("Stopping monitoring.")
			m.terminated <- true
			return
		case <-time.After(m.healthCheckInterval):
		}

		statusErr := m.checkStatus(ctx, pid)
		if statusErr == nil {
			continue
		}

		// Anything other than a restart request is fatal for the monitor.
		if !strings.Contains(statusErr.Error(), needRestart) {
			log.Println(statusErr)
			m.terminated <- true
			return
		}

		restarted, err := m.startPerf(ctx)
		if err != nil {
			m.terminated <- true
			return
		}
		pid = restarted
	}
}