blob: 41ec112720ace528dbd8929d8550b7ac0fd2c69d [file] [log] [blame]
// Copyright 2018 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Package ready provides functions to be passed as a "ready" function to the
// bundle main function.
package ready
import (
"context"
"fmt"
"os"
"os/exec"
"path/filepath"
"regexp"
"strconv"
"strings"
"syscall"
"time"
"github.com/shirou/gopsutil/process"
"chromiumos/tast/common/testexec"
upstartcommon "chromiumos/tast/common/upstart"
"chromiumos/tast/errors"
"chromiumos/tast/local/crash"
"chromiumos/tast/local/dbusutil"
"chromiumos/tast/local/hwsec"
"chromiumos/tast/local/upstart"
"chromiumos/tast/testing"
)
// Wait waits until the system is (marginally) ready for tests to run.
// Tast can sometimes be run against a freshly-booted VM, and we don't want every test that
// depends on a critical daemon to need to call upstart.WaitForJobStatus to wait for the
// corresponding job to be running. See https://crbug.com/897521 for more details.
func Wait(ctx context.Context) error {
// Periodically log a message to make it clearer what we're doing.
// Sending a periodic control message is also needed to let the main tast process
// know that the DUT is still responsive.
done := make(chan struct{})
defer func() { done <- struct{}{} }()
go func() {
for {
select {
case <-time.After(30 * time.Second):
testing.ContextLog(ctx, "Still waiting for important system services to be running")
case <-done:
return
}
}
}()
killOrphanAutotestd(ctx)
clearPolicies(ctx)
// Delete all core dumps to free up spaces.
if err := crash.DeleteCoreDumps(ctx); err != nil {
testing.ContextLog(ctx, "Failed to delete core dumps: ", err)
}
// If system-services doesn't enter "start/running", everything's probably broken, so give up.
const systemServicesJob = "system-services"
if err := upstart.WaitForJobStatus(ctx, systemServicesJob, upstartcommon.StartGoal, upstartcommon.RunningState,
upstart.TolerateWrongGoal, 2*time.Minute); err != nil {
return errors.Wrapf(err, "failed waiting for %v job", systemServicesJob)
}
// Start the ui job if it is not running. When the ui job is stopped, some
// daemons (e.g. debugd) are also stopped, so it is not useful to wait for
// them without starting the ui job.
if err := upstart.EnsureJobRunning(ctx, "ui"); err != nil {
return errors.Wrap(err, "failed to start ui")
}
// Make a best effort for important daemon jobs that start later, but just log errors instead of failing.
// We don't want to abort the whole test run if there's a bug in a daemon that prevents it from starting.
var daemonJobs = []string{
"attestationd",
"cryptohomed",
"debugd",
"metrics_daemon",
"shill",
"tpm_managerd",
}
type jobError struct {
job string
err error
}
ch := make(chan *jobError)
for _, job := range daemonJobs {
go func(job string) {
// Some Chrome-OS-derived systems may not have all of these jobs.
if !upstart.JobExists(ctx, job) {
ch <- nil
} else if err := upstart.WaitForJobStatus(ctx, job, upstartcommon.StartGoal, upstartcommon.RunningState,
upstart.TolerateWrongGoal, time.Minute); err == nil {
ch <- nil
} else {
ch <- &jobError{job, err}
}
}(job)
}
for range daemonJobs {
if je := <-ch; je != nil {
testing.ContextLogf(ctx, "Failed waiting for job %v: %v", je.job, je.err)
}
}
if upstart.JobExists(ctx, "cryptohomed") {
if err := waitForCryptohomeService(ctx); err != nil {
testing.ContextLog(ctx, "Failed waiting for cryptohome D-Bus service: ", err)
} else {
if hasTPM(ctx) {
if err := ensureTPMInitialized(ctx); err != nil {
testing.ContextLog(ctx, "Failed ensuring that TPM is initialized: ", err)
}
checkEnterpriseOwned(ctx)
} else {
testing.ContextLog(ctx, "TPM not available, not waiting for readiness")
}
}
}
if err := hwsec.BackupTPMManagerDataIfIntact(ctx); err != nil {
testing.ContextLog(ctx, "Failed to backup tpm manager local data: ", err)
}
return nil
}
// isAutotestd returns whether p is an autotestd process.
func isAutotestd(p *process.Process) bool {
cmd, err := p.CmdlineSlice()
// Process is killed or arguments are unavailable (e.g. system processes).
if err != nil || len(cmd) == 0 {
return false
}
if filepath.Base(cmd[0]) == "autotestd" {
return true
}
if filepath.Base(cmd[0]) == "python2.7" {
for _, s := range cmd[1:] {
if strings.HasPrefix(s, "-") {
// Skip flags (if any).
continue
}
return filepath.Base(s) == "autotestd"
}
}
return false
}
// killOrphanAutotestd sends SIGKILL to running autotestd processes and their
// subprocesses. This works around the known issue that autotestd from timed out
// jobs interferes with Tast tests (crbug.com/874333, crbug.com/977035).
func killOrphanAutotestd(ctx context.Context) {
ps, err := process.Processes()
if err != nil {
testing.ContextLog(ctx, "Failed to enumerate processes: ", err)
return
}
for _, p := range ps {
if !isAutotestd(p) {
continue
}
// Extract the process group ID of autotestd from ps output.
// Unfortunately gopsutil does not support getting it.
out, err := exec.Command("ps", "-o", "pgid=", strconv.Itoa(int(p.Pid))).Output()
if err != nil {
testing.ContextLog(ctx, "ps command failed: ", err)
}
pgid, err := strconv.Atoi(strings.TrimSpace(string(out)))
if err != nil {
testing.ContextLog(ctx, "Failed to parse ps command output: ", err)
}
testing.ContextLogf(ctx, "Killing orphan autotestd (pid=%d, pgid=%d)", p.Pid, pgid)
if err := syscall.Kill(-pgid, syscall.SIGKILL); err != nil {
testing.ContextLog(ctx, "Failed to kill autotestd: ", err)
}
}
}
// hasTPM checks whether the DUT has a TPM.
func hasTPM(ctx context.Context) bool {
const noTPMError = "Communication failure"
out, err := exec.Command("tpm_version").CombinedOutput()
if err != nil {
if exitError, ok := err.(*exec.ExitError); ok {
if !strings.Contains(string(out), noTPMError) {
// Only log unexpected errors. Communication failure is expected on devices without TPMs.
testing.ContextLogf(ctx, "tpm_version exited with code %d: %s", exitError.ExitCode(), string(out))
}
} else {
testing.ContextLog(ctx, "Failed to run tpm_version: ", err)
}
return false
}
return true
}
// waitForCryptohomeService waits for cryptohomed's D-Bus service to become available.
func waitForCryptohomeService(ctx context.Context) error {
const (
svc = "org.chromium.UserDataAuth"
svcTimeout = 15 * time.Second
minUptime = 10 * time.Second
)
bus, err := dbusutil.SystemBus()
if err != nil {
return errors.Wrap(err, "failed to connect to system bus")
}
wctx, wcancel := context.WithTimeout(ctx, svcTimeout)
defer wcancel()
if err = dbusutil.WaitForService(wctx, bus, svc); err != nil {
return errors.Wrapf(err, "%s D-Bus service unavailable", svc)
}
// Make sure that we don't start using cryptohomed immediately after it registers its D-Bus service,
// as that seems to sometimes cause it to hang: https://crbug.com/901363#c3, https://crbug.com/902199
// In practice, this only matters on freshly-started VMs -- cryptohomed has typically been running for
// a while on real hardware in the labs.
uptime, err := getCryptohomedUptime()
if err != nil {
return errors.Wrap(err, "failed to get process uptime")
}
if uptime < minUptime {
d := minUptime - uptime
testing.ContextLogf(ctx, "Waiting %v for cryptohomed to stabilize", d.Round(time.Millisecond))
if err := testing.Sleep(ctx, d); err != nil {
return err
}
}
return nil
}
// getCryptohomedUptime finds the cryptohomed process and returns how long it's been running.
func getCryptohomedUptime() (time.Duration, error) {
procs, err := process.Processes()
if err != nil {
return 0, err
}
var proc *process.Process
for _, p := range procs {
if exe, err := p.Exe(); err == nil && filepath.Base(exe) == "cryptohomed" {
proc = p
break
}
}
if proc == nil {
return 0, errors.New("didn't find process")
}
ms, err := proc.CreateTime() // milliseconds since epoch
if err != nil {
return 0, errors.Wrap(err, "failed to get start time")
}
d := time.Duration(ms) * time.Millisecond
ct := time.Unix(int64(d/time.Second), int64((d%time.Second)/time.Nanosecond))
return time.Since(ct), nil
}
// These match lines in the output from "tpm_manager_client status --nonsensitive".
var tpmEnabledRegexp = regexp.MustCompile(`(?m)^\s*is_enabled:\s*true\s*$`)
var tpmOwnedRegexp = regexp.MustCompile(`(?m)^\s*is_owned:\s*true\s*$`)
// These match lines in the output from "attestation_client status".
var tpmInitializedRegexp = regexp.MustCompile(`(?m)^\s*prepared_for_enrollment:\s*true\s*$`)
// ensureTPMInitialized checks if the TPM is already initialized and tries to take ownership if not.
// nil is returned if the TPM is not enabled (as is the case on VMs).
func ensureTPMInitialized(ctx context.Context) error {
ctx, cancel := context.WithTimeout(ctx, 2*time.Minute)
defer cancel()
tpmStatus := func(ctx context.Context) (enabled, initialized, owned bool, err error) {
tpmOut, err := testexec.CommandContext(ctx, "tpm_manager_client", "status", "--nonsensitive").Output()
if err != nil {
return false, false, false, err
}
attestOut, err := testexec.CommandContext(ctx, "attestation_client", "status").Output()
if err != nil {
return false, false, false, err
}
return tpmEnabledRegexp.Match(tpmOut), tpmInitializedRegexp.Match(attestOut), tpmOwnedRegexp.Match(tpmOut), nil
}
// Check if the TPM is disabled or already initialized.
enabled, initialized, owned, err := tpmStatus(ctx)
if err != nil {
return err
} else if !enabled || initialized {
return nil
}
testing.ContextLog(ctx, "TPM not initialized; taking ownership now to ensure that tests aren't blocked during login")
if owned {
testing.ContextLog(ctx, "TPM is already owned; finishing initialization")
}
if err := testexec.CommandContext(ctx, "tpm_manager_client", "take_ownership").Run(); err != nil {
return err
}
return testing.Poll(ctx, func(ctx context.Context) error {
if _, initialized, _, err := tpmStatus(ctx); err != nil {
// cryptohome error encountered while polling.
return testing.PollBreak(err)
} else if !initialized {
return errors.New("TPM not initialized yet")
}
return nil
}, nil)
}
// ClearPoliciesLogLocation is the location of the error log for clearPolicies.
// If the removing policies did not encounter any errors the file should not exist.
const ClearPoliciesLogLocation = "/tmp/ready-clearPolicies.err"
func errorLogAppendError(path, msg string) error {
f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
return errors.Wrapf(err, "failed to open error log %q", path)
}
defer f.Close()
if _, err := f.WriteString(msg + "\n"); err != nil {
return errors.Wrapf(err, "failed to append %q to error log %q", msg, path)
}
return nil
}
// clearPolicies removes all policies that might have been set by previously running tests.
// Tests that do not work with policies might still be affected by them, so this brings the device back to the default state.
// It is possible that device is already enrolled, but to unenroll the device we need a reboot, so we can do nothing here.
func clearPolicies(ctx context.Context) {
// /var/lib/whitelist is a directory containing device policies.
// /home/chronos/Local State is a file containing local state JSON including user policy data.
policyFiles := []string{"/var/lib/whitelist", "/home/chronos/Local State"}
// Clear error log for this function.
if err := os.RemoveAll(ClearPoliciesLogLocation); err != nil {
testing.ContextLogf(ctx, "Failed to remove error log %q: %v", ClearPoliciesLogLocation, err)
}
for _, path := range policyFiles {
if err := os.RemoveAll(path); err != nil {
msg := fmt.Sprintf("Failed to remove %q: %v", path, err)
testing.ContextLog(ctx, msg)
if err := errorLogAppendError(ClearPoliciesLogLocation, msg); err != nil {
testing.ContextLog(ctx, err.Error())
}
}
}
// Services that cache policies (like Chromium) are not restarted here.
// Tests that depend on the state of those services should perform the restart.
// Chromium related tests already restart Chromium and session_manager which will reload policies.
}
// EnterpriseOwnedLogLocation is the location of the error log for checkEnterpriseOwned.
// If the the DUT is not enterprise owned the file should not exist.
const EnterpriseOwnedLogLocation = "/tmp/ready-enterpriseOwned.err"
var trueRegex = regexp.MustCompile(`(?m)^\s*[Tt]rue\s*$`)
func checkEnterpriseOwned(ctx context.Context) {
isEnterpriseOwned := func(ctx context.Context) bool {
out, err := testexec.CommandContext(ctx, "cryptohome", "--action=install_attributes_get", "--name=enterprise.owned").Output()
if err != nil {
// Don't fail as install attributes can be missing. Device is not
// enterprise owned in that case.
return false
}
owned := trueRegex.Match(out)
return owned
}
// Clear error log for this function.
if err := os.RemoveAll(EnterpriseOwnedLogLocation); err != nil {
testing.ContextLogf(ctx, "Failed to remove error log %q: %v", EnterpriseOwnedLogLocation, err)
}
if isEnterpriseOwned(ctx) {
if err := errorLogAppendError(EnterpriseOwnedLogLocation, "Device is enterprise owned"); err != nil {
testing.ContextLog(ctx, err.Error())
}
testing.ContextLog(ctx, "Device is enterprise owned, please clear ownership before running tests")
testing.ContextLog(ctx, "To clear ownership you can powerwash (Ctrl + Alt + Shift + r at the login screen)")
}
}