blob: ab1290f1ddb3744326f2b5c73251a2d06d0efdd8 [file] [edit]
// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Package utilization provides functions to report DUT utilization metrics.
package utilization
import (
"context"
"fmt"
"go.chromium.org/luci/common/tsmon/field"
"go.chromium.org/luci/common/tsmon/metric"
swarmingv2 "go.chromium.org/luci/swarming/proto/api_v2"
"go.chromium.org/infra/cros/dutstate"
invV1 "go.chromium.org/infra/libs/skylab/inventory"
)
var dutmonMetric = metric.NewInt(
"chromeos/skylab/dut_mon/swarming_dut_count",
"The number of DUTs in a given bucket and status",
nil,
field.String("board"),
field.String("model"),
field.String("pool"),
field.String("zone"),
field.String("status"),
field.Bool("is_locked"),
)
// ReportMetrics reports DUT utilization metrics akin to dutmon in Autotest
//
// The reported fields closely match those reported by dutmon, but the metrics
// path is different.
func ReportMetrics(ctx context.Context, bis []*swarmingv2.BotInfo) {
c := make(counter)
for _, bi := range bis {
b := getBucketForBotInfo(bi)
s := getStatusForBotInfo(bi)
c.Increment(b, s)
}
c.Report(ctx)
}
// bucket contains static DUT dimensions.
//
// These dimensions do not change often. If all DUTs with a given set of
// dimensions are removed, the related metric is not automatically reset. The
// metric will get reset eventually.
type bucket struct {
board string
model string
pool string
zone string
}
func (b bucket) String() string {
return fmt.Sprintf("board: %s, model: %s, pool: %s, zone: %s", b.board, b.model, b.pool, b.zone)
}
// status is a dynamic DUT dimension.
//
// This dimension changes often. If no DUTs have a particular status value,
// the corresponding metric is immediately reset.
type status string
var allStatuses = []status{
"[None]",
"Ready",
"RepairFailed",
"NeedsRepair",
"NeedsReset",
"Running",
"NeedsDeploy",
"Deploying",
"Reserved",
"ManualRepair",
"NeedsManualRepair",
"NeedsReplacement",
"Unknown",
}
// counter collects number of DUTs per bucket and status.
type counter map[bucket]map[status]int
func (c counter) Increment(b bucket, s status) {
sc := c[b]
if sc == nil {
sc = make(map[status]int)
c[b] = sc
}
sc[s]++
}
func (c counter) Report(ctx context.Context) {
for b, counts := range c {
for _, s := range allStatuses {
// TODO(crbug/929872) Report locked status once DUT leasing is
// implemented in Skylab.
dutmonMetric.Set(ctx, int64(counts[s]), b.board, b.model, b.pool, b.zone, string(s), false)
}
}
}
func getBucketForBotInfo(bi *swarmingv2.BotInfo) bucket {
b := bucket{
board: "[None]",
model: "[None]",
pool: "[None]",
zone: "[None]",
}
for _, d := range bi.Dimensions {
switch d.Key {
case "label-board":
b.board = summarizeValues(d.Value)
case "label-model":
b.model = summarizeValues(d.Value)
case "label-pool":
b.pool = getReportPool(d.Value)
case "label-zone":
b.zone = summarizeValues(d.Value)
default:
// Ignore other dimensions.
}
}
return b
}
func getStatusForBotInfo(bi *swarmingv2.BotInfo) status {
dutState := ""
for _, d := range bi.Dimensions {
switch d.Key {
case "dut_state":
dutState = summarizeValues(d.Value)
default:
// Ignore other dimensions.
}
}
// Order matters: a bot may be dead and still have a task associated with it.
if !isBotHealthy(bi) {
return "[None]"
}
botBusy := bi.TaskId != ""
switch dutState {
case dutstate.Ready.String():
if botBusy {
return "Running"
}
return "Ready"
case "running":
return "Running"
case dutstate.NeedsReset.String():
// We count time spent waiting for a reset task to be assigned as time
// spent Resetting.
return "NeedsReset"
case dutstate.NeedsRepair.String():
// We count time spent waiting for a repair task to be assigned as time
// spent Repairing.
return "NeedsRepair"
case dutstate.RepairFailed.String():
return "RepairFailed"
case dutstate.NeedsDeploy.String():
return "NeedsDeploy"
case dutstate.Deploying.String():
return "Deploying"
case dutstate.Reserved.String():
return "Reserved"
case dutstate.ManualRepair.String():
return "ManualRepair"
case dutstate.NeedsManualRepair.String():
return "NeedsManualRepair"
case dutstate.NeedsReplacement.String():
return "NeedsReplacement"
case dutstate.Unknown.String():
return "Unknown"
default:
return "[None]"
// We should never see this state
}
}
func isBotHealthy(bi *swarmingv2.BotInfo) bool {
return !bi.Deleted && !bi.IsDead && !bi.Quarantined
}
func summarizeValues(vs []string) string {
switch len(vs) {
case 0:
return "[None]"
case 1:
return vs[0]
default:
return "[Multiple]"
}
}
func isManagedPool(p string) bool {
_, ok := invV1.SchedulableLabels_DUTPool_value[p]
return ok
}
func getReportPool(pools []string) string {
p := summarizeValues(pools)
if isManagedPool(p) {
return fmt.Sprintf("managed:%s", p)
}
return p
}