| // Copyright 2018 The Chromium OS Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| // Package cpu measures CPU usage. |
| package cpu |
| |
| import ( |
| "context" |
| "io/ioutil" |
| "os" |
| "path/filepath" |
| "regexp" |
| "strconv" |
| "sync" |
| "syscall" |
| "time" |
| |
| "github.com/shirou/gopsutil/cpu" |
| |
| "chromiumos/tast/common/testexec" |
| upstartcommon "chromiumos/tast/common/upstart" |
| "chromiumos/tast/ctxutil" |
| "chromiumos/tast/errors" |
| "chromiumos/tast/local/gtest" |
| "chromiumos/tast/local/upstart" |
| "chromiumos/tast/testing" |
| ) |
| |
// ExitOption selects how MeasureProcessUsage disposes of the child processes
// it started once the measurement is over.
type ExitOption int

const (
	// KillProcess kills the child process when the function is done.
	KillProcess = ExitOption(iota)
	// WaitProcess waits for the child process to finish on its own.
	WaitProcess
)
| |
const (
	// raplExec is the path of the command used to measure power consumption.
	// It is only supported on Intel platforms.
	raplExec = "/usr/bin/dump_intel_rapl_consumption"
)
| |
| // MeasureProcessUsage starts one or more gtest processes and measures CPU usage and power consumption asynchronously |
| // for the given duration. A map is returned containing CPU usage (percentage in [0-100] range) with key "cpu" and power |
| // consumption (Watts) with key "power" if supported. |
| func MeasureProcessUsage(ctx context.Context, duration time.Duration, |
| exitOption ExitOption, ts ...*gtest.GTest) (measurements map[string]float64, retErr error) { |
| const ( |
| stabilizeTime = 1 * time.Second // time to wait for CPU to stabilize after launching proc. |
| cleanupTime = 5 * time.Second // time reserved for cleanup after measuring. |
| ) |
| |
| for _, t := range ts { |
| // Start the process asynchronously by calling the provided startup function. |
| cmd, err := t.Start(ctx) |
| if err != nil { |
| return nil, errors.Wrap(err, "failed to run binary") |
| } |
| |
| // Clean up the process upon exiting the function. |
| defer func() { |
| // If the exit option is 'WaitProcess' wait for the process to terminate. |
| if exitOption == WaitProcess { |
| if err := cmd.Wait(); err != nil { |
| retErr = err |
| testing.ContextLog(ctx, "Failed waiting for the command to exit: ", retErr) |
| } |
| return |
| } |
| |
| // If the exit option is 'KillProcess' we will send a 'SIGKILL' signal |
| // to the process after collecting performance metrics. |
| if err := cmd.Kill(); err != nil { |
| retErr = err |
| testing.ContextLog(ctx, "Failed to kill process: ", retErr) |
| return |
| } |
| |
| // After sending a 'SIGKILL' signal to the process we need to wait |
| // for the process to terminate. If Wait() doesn't return any error, |
| // we know the process already terminated before we explicitly killed |
| // it and the measured performance metrics are invalid. |
| err = cmd.Wait() |
| if err == nil { |
| retErr = errors.New("process did not run for entire measurement duration") |
| testing.ContextLog(ctx, retErr) |
| return |
| } |
| |
| // Check whether the process was terminated with a 'SIGKILL' signal. |
| ws, ok := testexec.GetWaitStatus(err) |
| if !ok { |
| retErr = errors.Wrap(err, "failed to get wait status") |
| testing.ContextLog(ctx, retErr) |
| } else if !ws.Signaled() || ws.Signal() != syscall.SIGKILL { |
| retErr = errors.Wrap(err, "process did not terminate with SIGKILL signal") |
| testing.ContextLog(ctx, retErr) |
| } |
| }() |
| } |
| |
| // Use a shorter context to leave time for cleanup upon failure. |
| ctx, cancel := ctxutil.Shorten(ctx, cleanupTime) |
| defer cancel() |
| |
| if err := testing.Sleep(ctx, stabilizeTime); err != nil { |
| return nil, errors.Wrap(err, "failed waiting for CPU usage to stabilize") |
| } |
| |
| testing.ContextLog(ctx, "Measuring CPU usage and power consumption for ", duration.Round(time.Second)) |
| return MeasureUsage(ctx, duration) |
| } |
| |
| // SetUpBenchmark performs setup needed for running benchmarks. It disables CPU frequency scaling |
| // and thermal throttling, and waits for the CPU to become idle. A deferred call to the returned |
| // cleanUp function should be scheduled by the caller if err is non-nil. |
| func SetUpBenchmark(ctx context.Context) (cleanUp func(ctx context.Context), err error) { |
| const cleanupTime = 10 * time.Second // time reserved for cleanup on error. |
| |
| var restoreScaling func(ctx context.Context) error |
| var restoreThrottling func(ctx context.Context) error |
| cleanUp = func(ctx context.Context) { |
| if restoreScaling != nil { |
| if err = restoreScaling(ctx); err != nil { |
| testing.ContextLog(ctx, "Failed to restore CPU frequency scaling to original values: ", err) |
| } |
| } |
| if restoreThrottling != nil { |
| if err = restoreThrottling(ctx); err != nil { |
| testing.ContextLog(ctx, "Failed to restore CPU thermal throttling to original values: ", err) |
| } |
| } |
| } |
| |
| // Run the cleanUp function automatically if we encounter an error. |
| doCleanup := cleanUp |
| defer func() { |
| if doCleanup != nil { |
| doCleanup(ctx) |
| } |
| }() |
| |
| // Run all non-cleanup operations with a shorter context. This ensures |
| // thermal throttling and CPU frequency scaling get re-enabled, even when |
| // test execution exceeds the maximum time allowed. |
| ctx, cancel := ctxutil.Shorten(ctx, cleanupTime) |
| defer cancel() |
| |
| // CPU frequency scaling and thermal throttling might influence our test results. |
| if restoreScaling, err = disableCPUFrequencyScaling(ctx); err != nil { |
| return nil, errors.Wrap(err, "failed to disable CPU frequency scaling") |
| } |
| if restoreThrottling, err = disableThermalThrottling(ctx); err != nil { |
| return nil, errors.Wrap(err, "failed to disable thermal throttling") |
| } |
| |
| // Disarm running the cleanUp function now that we expect the caller to do it. |
| doCleanup = nil |
| return cleanUp, nil |
| } |
| |
| // WaitUntilIdle waits until the CPU is idle, for a maximum of 120s. The CPU is |
| // considered idle if the average usage over all CPU cores is less than 5%. |
| // This percentage will be gradually increased to 20%, as older boards might |
| // have a hard time getting below 5%. |
| func WaitUntilIdle(ctx context.Context) error { |
| const ( |
| // time to wait for CPU to become idle. |
| waitIdleCPUTimeout = 120 * time.Second |
| // percentage below which CPU is ideally considered idle, gradually |
| // increased up to idleCPUUsagePercentMax. |
| idleCPUUsagePercentBase = 5.0 |
| // maximum percentage below which CPU is considered idle. |
| idleCPUUsagePercentMax = 20.0 |
| // times we wait for CPU to become idle, idle percentage is increased each time. |
| idleCPUSteps = 5 |
| ) |
| |
| // Wait for the CPU to become idle. It's e.g. possible the board just booted |
| // and is running various startup programs. Some slower platforms have a |
| // hard time getting below 10% CPU usage, so we'll gradually increase the |
| // CPU idle threshold. |
| var err error |
| startTime := time.Now() |
| idleIncrease := (idleCPUUsagePercentMax - idleCPUUsagePercentBase) / (idleCPUSteps - 1) |
| testing.ContextLogf(ctx, "Waiting for idle CPU at most %v, threshold will be gradually relaxed (from %.1f%% to %.1f%%)", |
| waitIdleCPUTimeout, idleCPUUsagePercentBase, idleCPUUsagePercentMax) |
| for i := 0; i < idleCPUSteps; i++ { |
| idlePercent := idleCPUUsagePercentBase + (idleIncrease * float64(i)) |
| timeout := waitIdleCPUTimeout / idleCPUSteps |
| testing.ContextLogf(ctx, "Waiting up to %v for CPU usage to drop below %.1f%% (%d/%d)", |
| timeout.Round(time.Second), idlePercent, i+1, idleCPUSteps) |
| var usage float64 |
| if usage, err = waitUntilIdleStep(ctx, timeout, idlePercent); err == nil { |
| testing.ContextLogf(ctx, "Waiting for idle CPU took %v (usage: %.1f%%, threshold: %.1f%%)", |
| time.Now().Sub(startTime).Round(time.Second), usage, idlePercent) |
| return nil |
| } |
| } |
| return err |
| } |
| |
| // waitUntilIdleStep waits until the CPU is idle or the specified timeout has |
| // elapsed and returns CPU usage. The CPU is considered idle if the average CPU |
| // usage over all cores is less than maxUsage, which is a percentage in the |
| // range [0.0, 100.0]. |
| func waitUntilIdleStep(ctx context.Context, timeout time.Duration, maxUsage float64) (usage float64, err error) { |
| const measureDuration = time.Second |
| err = testing.Poll(ctx, func(context.Context) error { |
| var e error |
| // testing.Poll shortens ctx so that its deadline matches timeout. Use the original ctx to |
| // prevent the Sleep in MeasureCPUUsage from always failing during the last poll iteration. |
| usage, e = MeasureCPUUsage(ctx, measureDuration) |
| if e != nil { |
| return testing.PollBreak(errors.Wrap(e, "failed measuring CPU usage")) |
| } |
| if usage >= maxUsage { |
| return errors.Errorf("CPU not idle: got %.1f%%; want < %.1f%%", usage, maxUsage) |
| } |
| return nil |
| }, &testing.PollOptions{Timeout: timeout}) |
| if err != nil { |
| return usage, err |
| } |
| return usage, nil |
| } |
| |
| // MeasureUsage measures the average utilization across all CPUs and the |
| // average SoC 'pkg' power consumption during the specified duration. Measuring |
| // power consumption is currently not supported on all platforms. A map is |
| // returned containing CPU usage (percentage in [0-100] range) and power |
| // consumption (Watts) if supported. |
| func MeasureUsage(ctx context.Context, duration time.Duration) (map[string]float64, error) { |
| var cpuUsage, powerConsumption float64 |
| var cpuErr, powerErr error |
| var wg sync.WaitGroup |
| |
| // Start measuring CPU usage asynchronously. |
| wg.Add(1) |
| go func() { |
| defer wg.Done() |
| cpuUsage, cpuErr = MeasureCPUUsage(ctx, duration) |
| }() |
| |
| // Start measuring power consumption asynchronously. Power consumption |
| // is currently only measured on Intel devices that support the |
| // dump_intel_rapl_consumption command. |
| if _, powerErr = os.Stat(raplExec); powerErr == nil { |
| wg.Add(1) |
| go func() { |
| defer wg.Done() |
| if powerConsumption, powerErr = MeasurePowerConsumption(ctx, duration); powerErr != nil { |
| testing.ContextLog(ctx, "Measuring power consumption failed: ", powerErr) |
| } |
| }() |
| } |
| |
| wg.Wait() |
| |
| measurements := make(map[string]float64) |
| if cpuErr == nil { |
| measurements["cpu"] = cpuUsage |
| } |
| if powerErr == nil { |
| measurements["power"] = powerConsumption |
| } |
| |
| // Ignore powerErr as not all platforms support measuring power consumption. |
| return measurements, cpuErr |
| } |
| |
| // MeasureCPUUsage measures utilization across all CPUs during duration. |
| // Returns a percentage in the range [0.0, 100.0]. |
| func MeasureCPUUsage(ctx context.Context, duration time.Duration) (float64, error) { |
| // Get the total time the CPU spent in different states (read from |
| // /proc/stat on linux machines). |
| statBegin, err := getStat() |
| if err != nil { |
| return 0, err |
| } |
| |
| if err := testing.Sleep(ctx, duration); err != nil { |
| return 0, err |
| } |
| |
| // Get the total time the CPU spent in different states again. By looking at |
| // the difference with the values we got earlier, we can calculate the time |
| // the processor was idle. The gopsutil library also has a function that |
| // does this directly, but unfortunately we can't use it as that function |
| // doesn't abort when the timeout in ctx is exceeded. |
| statEnd, err := getStat() |
| if err != nil { |
| return 0, err |
| } |
| |
| totalTimeBegin := statBegin.Total() |
| activeTimeBegin := totalTimeBegin - (statBegin.Idle + statBegin.Iowait) |
| totalTimeEnd := statEnd.Total() |
| activeTimeEnd := totalTimeEnd - (statEnd.Idle + statEnd.Iowait) |
| |
| if totalTimeEnd <= totalTimeBegin { |
| return 0.0, errors.Errorf("total time went from %f to %f", totalTimeBegin, totalTimeEnd) |
| } |
| |
| return (activeTimeEnd - activeTimeBegin) / (totalTimeEnd - totalTimeBegin) * 100.0, nil |
| } |
| |
| // MeasurePowerConsumption measures power consumption during the specified |
| // duration and returns the average power consumption (in Watts). The power |
| // consumption is acquired by reading the RAPL 'pkg' entry, which gives a |
| // measure of the total SoC power consumption. |
| func MeasurePowerConsumption(ctx context.Context, duration time.Duration) (float64, error) { |
| cmd := testexec.CommandContext(ctx, raplExec, "--interval_ms="+ |
| strconv.FormatInt(int64(duration/time.Millisecond), 10)) |
| powerConsumptionOutput, err := cmd.CombinedOutput() |
| if err != nil { |
| return 0.0, err |
| } |
| |
| var powerConsumptionRegex = regexp.MustCompile(`(\d+\.\d+)`) |
| match := powerConsumptionRegex.FindAllString(string(powerConsumptionOutput), 1) |
| if len(match) != 1 { |
| return 0.0, errors.Errorf("failed to parse output of %s", raplExec) |
| } |
| powerConsumption, err := strconv.ParseFloat(match[0], 64) |
| if err != nil { |
| return 0.0, err |
| } |
| |
| return powerConsumption, nil |
| } |
| |
| // getStat returns utilization stats across all CPUs as reported by /proc/stat. |
| func getStat() (*cpu.TimesStat, error) { |
| times, err := cpu.Times(false) |
| if err != nil { |
| return nil, err |
| } |
| return ×[0], nil |
| } |
| |
// cpuConfigEntry describes one CPU configuration setting: a file and the value
// to write to it. The field order matters, as entries are created with
// positional struct literals.
type cpuConfigEntry struct {
	// path is the file to write; it may be a glob pattern before expansion.
	path string
	// value is the content written to path.
	value string
	// ignoreErrors turns a failure to apply this entry into a logged warning
	// instead of an error. This is needed as on some platforms we might not
	// have the right permissions to disable frequency scaling.
	ignoreErrors bool
}
| |
| // disableCPUFrequencyScaling disables frequency scaling. All CPU cores will be |
| // set to always run at their maximum frequency. A function is returned so the |
| // caller can restore the original CPU frequency scaling configuration. |
| // Depending on the platform different mechanisms are present: |
| // - Some Intel-based platforms (e.g. Eve and Nocturne) ignore the values set |
| // in the scaling_governor, and instead use the intel_pstate application to |
| // control CPU frequency scaling. |
| // - Most platforms use the scaling_governor to control CPU frequency scaling. |
| // - Some platforms (e.g. Dru) use a different CPU frequency scaling governor. |
| func disableCPUFrequencyScaling(ctx context.Context) (func(ctx context.Context) error, error) { |
| configPatterns := []cpuConfigEntry{ |
| // crbug.com/938729: BIOS settings might prevent us from overwriting intel_pstate/no_turbo. |
| {"/sys/devices/system/cpu/intel_pstate/no_turbo", "1", true}, |
| // Fix the intel_pstate percentage to 100 if possible. We raise the |
| // maximum value before the minimum value as the min cannot exceed the |
| // max. To restore them, the order must be inverted. Note that we set |
| // and save the original values for these values because changing |
| // scaling_governor to "performance" can change these values as well. |
| {"/sys/devices/system/cpu/intel_pstate/max_perf_pct", "100", false}, |
| {"/sys/devices/system/cpu/intel_pstate/min_perf_pct", "100", false}, |
| // crbug.com/977925: Disabled hyperthreading cores are listed but |
| // writing config for these disabled cores results in 'invalid argument'. |
| // TODO(dstaessens): Skip disabled CPU cores when setting scaling_governor. |
| {"/sys/devices/system/cpu/cpu[0-9]*/cpufreq/scaling_governor", "performance", true}, |
| {"/sys/class/devfreq/devfreq[0-9]*/governor", "performance", true}, |
| } |
| |
| var optimizedConfig []cpuConfigEntry |
| // Expands patterns in configPatterns and pack actual configs into |
| // optimizedConfig. |
| for _, config := range configPatterns { |
| paths, err := filepath.Glob(config.path) |
| if err != nil { |
| return nil, err |
| } |
| for _, path := range paths { |
| optimizedConfig = append(optimizedConfig, cpuConfigEntry{ |
| path, |
| config.value, |
| config.ignoreErrors, |
| }) |
| } |
| } |
| |
| origConfig, err := applyConfig(ctx, optimizedConfig) |
| undo := func(ctx context.Context) error { |
| _, err := applyConfig(ctx, origConfig) |
| return err |
| } |
| if err != nil { |
| undo(ctx) |
| return nil, err |
| } |
| return undo, nil |
| } |
| |
| // applyConfig applies the specified frequency scaling configuration. A slice of |
| // cpuConfigEntry needs to be provided and will be processed in order. A slice |
| // of the original cpuConfigEntry values that were successfully processed is |
| // returned in reverse order so the caller can restore the original config by |
| // passing the slice to this function as is. If ignoreErrors is true for a |
| // config entry we won't return an error upon failure, but will only show a |
| // warning. The provided context will only be used for logging, so the config |
| // will even be applied upon timeout. |
| func applyConfig(ctx context.Context, cpuConfig []cpuConfigEntry) ([]cpuConfigEntry, error) { |
| var origConfig []cpuConfigEntry |
| for _, config := range cpuConfig { |
| origValue, err := ioutil.ReadFile(config.path) |
| if err != nil { |
| if !config.ignoreErrors { |
| return origConfig, err |
| } |
| testing.ContextLogf(ctx, "Failed to read %v: %v", config.path, err) |
| continue |
| } |
| if err = ioutil.WriteFile(config.path, []byte(config.value), 0644); err != nil { |
| if !config.ignoreErrors { |
| return origConfig, err |
| } |
| testing.ContextLogf(ctx, "Failed to write to %v: %v", config.path, err) |
| continue |
| } |
| // Inserts a new entry at the front of origConfig. |
| e := cpuConfigEntry{config.path, string(origValue), false} |
| origConfig = append([]cpuConfigEntry{e}, origConfig...) |
| } |
| return origConfig, nil |
| } |
| |
| // disableThermalThrottling disables thermal throttling, as it might interfere |
| // with test execution. A function is returned that restores the original |
| // settings, so the caller can re-enable thermal throttling after testing. |
| func disableThermalThrottling(ctx context.Context) (func(context.Context) error, error) { |
| job := getThermalThrottlingJob(ctx) |
| if job == "" { |
| return func(ctx context.Context) error { return nil }, nil |
| } |
| |
| _, state, _, err := upstart.JobStatus(ctx, job) |
| if err != nil { |
| return nil, err |
| } else if state != upstartcommon.RunningState { |
| return func(ctx context.Context) error { return nil }, nil |
| } |
| |
| if err := upstart.StopJob(ctx, job); err != nil { |
| return nil, err |
| } |
| |
| undo := func(ctx context.Context) error { return upstart.EnsureJobRunning(ctx, job) } |
| return undo, nil |
| } |
| |
| // getThermalThrottlingJob tries to determine the name of the thermal throttling |
| // job used by the current platform. |
| func getThermalThrottlingJob(ctx context.Context) string { |
| // List of possible thermal throttling jobs that should be disabled: |
| // - dptf for intel >= baytrail |
| // - temp_metrics for link |
| // - thermal for daisy, snow, pit,... |
| for _, job := range []string{"dptf", "temp_metrics", "thermal"} { |
| if upstart.JobExists(ctx, job) { |
| return job |
| } |
| } |
| return "" |
| } |