blob: 1b49acad590fd78a619476900dbdcba227c56c5a [file] [log] [blame] [edit]
// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
package metrics
import (
"context"
"fmt"
"strings"
"time"
"go.chromium.org/luci/common/errors"
"go.chromium.org/infra/libs/skylab/buildbucket"
)
// ActionStatus is the outcome recorded for an action.
type ActionStatus string

// Values that an ActionStatus can take.
const (
	// ActionStatusUnspecified is an unknown status.
	ActionStatusUnspecified ActionStatus = ""
	// ActionStatusSuccess represents a successful action.
	ActionStatusSuccess ActionStatus = "success"
	// ActionStatusFail represents a failed action.
	ActionStatusFail ActionStatus = "fail"
	// ActionStatusSkip represents a skipped action.
	// TODO(gregorynisbet): Add support for skipped actions to Karte OR record the number of skipped actions as a plan-level observation.
	ActionStatusSkip ActionStatus = "skip"
)

// AllowFail records whether failure of an action is allowed.
type AllowFail string

// Values that an AllowFail can take.
const (
	// AllowFailUnspecified means it was not stated whether failure is allowed.
	AllowFailUnspecified AllowFail = ""
	// YesAllowFail means failure is allowed.
	YesAllowFail AllowFail = "allow-fail"
	// NoAllowFail means failure is not allowed.
	NoAllowFail AllowFail = "no-allow-fail"
)
// ValueType is the type of the value carried by an observation, such as a
// number or a string. The value itself is always stored as a string; the
// ValueType says how that string should be interpreted.
type ValueType string
const (
// ValueTypeUnspecified is an unknown value type.
ValueTypeUnspecified ValueType = ""
// ValueTypeString represents a string-valued measurement.
ValueTypeString ValueType = "string"
// ValueTypeNumber represents a real-valued measurement.
// It is used for both float-valued and int-valued observations.
ValueTypeNumber ValueType = "number"
)
// ActionType describes the use of the action within the plan.
type ActionType string

// Action type values.
//
// These are untyped string constants, so they can be assigned both to an
// ActionType and to a plain string.
const (
	// ActionTypeUnspecified is an unknown action type.
	ActionTypeUnspecified = ""
	// ActionTypeVerifier is the "verifier" action type.
	ActionTypeVerifier = "verifier"
	// ActionTypeCondition is the "condition" action type.
	ActionTypeCondition = "condition"
	// ActionTypeRecovery is the "recovery" action type.
	ActionTypeRecovery = "recovery"
)

// Failure type values.
//
// These are untyped string constants as well; presumably they are the
// values intended for a failure's type.
const (
	// FailureTypeUnspecified is an unknown failure type.
	FailureTypeUnspecified = ""
	// FailureTypeProvisioning is the "provisioning" failure type.
	FailureTypeProvisioning = "provisioning"
	// FailureTypeRepair is the "repair" failure type.
	FailureTypeRepair = "repair"
)
// Action is an event performed on a DUT.
// TODO(gregorynisbet): Rename an action to something else so we don't collide with the other notion of an action.
type Action struct {
// Name is the identifier for an action. It is controlled by Karte.
Name string
// ActionKind is a coarse-grained kind of action e.g. "ssh".
ActionKind string
// SwarmingTaskID is the ID of the associated swarming task.
SwarmingTaskID string
// BuildbucketID is the ID of the buildbucket build.
BuildbucketID string
// Board is the board of the device.
Board string
// Model is the model of the device.
Model string
// AssetTag is the asset tag of the DUT that the action is recorded for.
AssetTag string
// StartTime is when the event started.
StartTime time.Time
// StopTime is when the event ended.
StopTime time.Time
// Status is whether the event was successful, failed, skipped, or unknown.
Status ActionStatus
// Hostname is the hostname of the device or the name of the unit.
Hostname string
// FailReason is an error message with information describing the failure.
FailReason string
// Observations are the observations associated with the current action.
Observations []*Observation
// RecoveredBy is the name of the action that recovered us.
RecoveredBy string
// Restarts is how many times we have re-traversed the plan.
Restarts int32
// AllowFail is whether failures are allowed or not.
AllowFail AllowFail
// PlanName is the name of the currently-executing plan.
PlanName string
// Type describes the use of the action within the plan. The action type
// values declared in this file are "verifier", "condition", and "recovery".
Type ActionType
}
// UpdateStatus records the outcome of the action based on err.
//
// A non-nil err marks the action as failed and stores the error text as the
// fail reason. A nil err marks the action as successful, unless the action
// is already marked as skipped, in which case the status is left alone.
func (a *Action) UpdateStatus(err error) {
	switch {
	case err != nil:
		a.Status = ActionStatusFail
		a.FailReason = err.Error()
	case a.Status == ActionStatusSkip:
		// A skipped action stays skipped.
	default:
		a.Status = ActionStatusSuccess
	}
}
// Observation is a measurement associated with an event performed on a DUT.
// The measured value is always stored as a string; ValueType says how to
// interpret it.
type Observation struct {
// MetricKind is the metric kind (e.g. battery percentage).
MetricKind string
// ValueType is the type of value (e.g. string or number).
ValueType ValueType
// Value is the value itself, in string form.
Value string
}
// FailureType describes the type of the failure that occurred on the DUT.
// This file declares the failure type values "provisioning" and "repair".
type FailureType string
// Failure is an event that occurred on a DUT during the recovery process.
type Failure struct {
// Name is the identifier for a failure. It is controlled by Karte.
Name string
// SwarmingTaskID is the ID of the associated swarming task.
SwarmingTaskID string
// BuildbucketID is the ID of the buildbucket build.
BuildbucketID string
// FailureType is a coarse-grained type of failure e.g. "provisioning".
FailureType FailureType
// DeviceId is the unique identifier of the device in inventory.
DeviceId string
// DeviceName is the resource name for the device.
DeviceName string
// DeviceState is the state of the DUT at the time of the failure.
DeviceState string
// DeviceModel is the model of the device this event applies to.
DeviceModel string
// DeviceBoard is the board of the device.
DeviceBoard string
// DeviceOsBuild is the build version on the device at the time of the failure.
DeviceOsBuild string
// DeviceOsType is the type of the OS at the time of the failure.
DeviceOsType string
// FailureTime is the time when the failure occurred.
FailureTime time.Time
}
// NewFloat64Observation builds a number-typed observation of the given kind.
// The value is stored as the float formatted with the "%f" verb.
func NewFloat64Observation(kind string, value float64) *Observation {
	obs := &Observation{MetricKind: kind, ValueType: ValueTypeNumber}
	obs.Value = fmt.Sprintf("%f", value)
	return obs
}
// NewInt64Observation builds a number-typed observation of the given kind.
// The value is stored as the integer in decimal form.
func NewInt64Observation(kind string, value int64) *Observation {
	text := fmt.Sprintf("%d", value)
	return &Observation{MetricKind: kind, ValueType: ValueTypeNumber, Value: text}
}
// NewStringObservation builds a string-typed observation of the given kind.
// The value is stored unchanged.
func NewStringObservation(kind string, value string) *Observation {
	obs := new(Observation)
	obs.MetricKind = kind
	obs.ValueType = ValueTypeString
	obs.Value = value
	return obs
}
// A Query is a collection of time-bounded search criteria for actions on DUTs.
//
// Not every field can be turned into a Karte filter yet: Lower reports
// "not yet implemented" when StartTime, StopTime, AssetTag, or PageToken is set.
type Query struct {
// StartTime is the starting time for the query.
StartTime time.Time
// StopTime is the ending time for the query.
StopTime time.Time
// AssetTag is the asset tag for the DUT in question.
AssetTag string
// Hostname is the hostname for the DUT in question.
// The hostname is less reliable than the asset tag because
// it identifies a location rather than a device per se.
Hostname string
// ActionKind filters the actions by the "ActionKind" field.
ActionKind string
// Limit imposes a limit on the total number of actions returned.
// It is not part of the filter string produced by Lower.
Limit int
// PageToken is an opaque blob of data that is used to start the query at a specific point.
PageToken string
// OrderDescending controls how the result set should be ordered by time.
// It is not consulted by Lower.
OrderDescending bool
}
// Lower converts a query into a filter string in the syntax that Karte accepts.
// See karte/api/filter_syntax.md for more information.
//
// A nil query lowers to the empty filter. Only Hostname and ActionKind are
// translated into filter clauses, joined with " && ". Setting StartTime,
// StopTime, AssetTag, or PageToken produces a "not yet implemented" error.
// Limit is intentionally left out of the filter.
func (q *Query) Lower() (string, error) {
	if q == nil {
		return "", nil
	}
	// Keep these checks up-to-date with the fields of Query.
	unsupported := !q.StartTime.IsZero() ||
		!q.StopTime.IsZero() ||
		q.AssetTag != "" ||
		q.PageToken != ""
	if unsupported {
		return "", errors.Reason("lower: not yet implemented").Err()
	}
	var clauses []string
	if host := q.Hostname; host != "" {
		clauses = append(clauses, fmt.Sprintf(`hostname == %q`, host))
	}
	if kind := q.ActionKind; kind != "" {
		clauses = append(clauses, fmt.Sprintf(`kind == %q`, kind))
	}
	// q.Limit is intentionally ignored for the purposes of generating a query.
	return strings.Join(clauses, " && "), nil
}
// NewLastActionQuery returns a query for the last record of a given kind for the asset in question.
// The query is limited to a single result.
func NewLastActionQuery(assetTag string, kind string) *Query {
	q := &Query{Limit: 1}
	q.AssetTag = assetTag
	q.ActionKind = kind
	return q
}
// NewLastActionBeforeTimeQuery returns a query for the last record of a given kind
// before the stop time for the asset in question.
// The query is limited to a single result.
func NewLastActionBeforeTimeQuery(assetTag string, kind string, stopTime time.Time) *Query {
	q := &Query{Limit: 1}
	q.AssetTag = assetTag
	q.ActionKind = kind
	q.StopTime = stopTime
	return q
}
// NewListActionsInRangeQuery lists the actions for a given asset and given range in order.
//
// assetTag and kind select the asset and the action kind, startTime and
// stopTime bound the range, pageToken resumes a previous query, and limit
// caps the number of actions returned per query.
//
// Sample usage:
//
//	q := NewListActionsInRangeQuery(..., "token1", 10)
//	res, err := metrics.Search(ctx, q)
//	if err != nil {
//		...
//	}
//	q = NewListActionsInRangeQuery(..., res.PageToken, 10)
//	res, err = metrics.Search(ctx, q)
//	...
func NewListActionsInRangeQuery(assetTag string, kind string, startTime time.Time, stopTime time.Time, pageToken string, limit int) *Query {
	return &Query{
		AssetTag:   assetTag,
		ActionKind: kind,
		StartTime:  startTime,
		StopTime:   stopTime,
		PageToken:  pageToken,
		// The limit used to be dropped silently; carry it into the query.
		Limit: limit,
	}
}
// A QueryResult is the result of running a query.
type QueryResult struct {
// Actions are the actions satisfying the criteria in question.
Actions []*Action
// PageToken is the token for resuming the query, if such a token exists.
// It can be passed back as the PageToken of a follow-up Query.
PageToken string
}
// MetricSaver is a function that saves an action as a metric without
// requiring a context. It returns an error if the action could not be saved.
type MetricSaver func(action *Action) error
// Metrics is a simple interface for logging
// structured events and metrics.
type Metrics interface {
// Create takes an action and creates it on the Karte side.
// On success, it updates its action argument to reflect the Karte state.
// Local versions of Create should emulate this.
Create(ctx context.Context, action *Action) error
// RegisterFailure takes a failure and creates it on the Karte side.
RegisterFailure(ctx context.Context, failure *Failure) error
// Search lists all the actions matching a set of constraints, up to
// a limit on the number of returned actions.
// The result also carries a page token for resuming the query, if any.
Search(ctx context.Context, q *Query) (*QueryResult, error)
}
// CountFailedRepairFromMetrics determines the number of failed PARIS repair tasks
// since the last successful PARIS repair task.
//
// dutName is used as the hostname to search for. An empty taskName means do
// not filter based on the task name. A limit that is zero or negative means
// no limit is set on the query.
//
// The count is the number of leading actions in the search result whose
// status is not success; counting stops at the first successful action.
// NOTE(review): this assumes Search returns the most recent actions first —
// confirm against the Search implementation.
//
// It returns an error if metricsService is nil or if the search fails.
func CountFailedRepairFromMetrics(ctx context.Context, dutName string, taskName string, limit int, metricsService Metrics) (int, error) {
	if metricsService == nil {
		return 0, errors.Reason("count failed repair from karte: karte metric has not been initialized").Err()
	}
	// TODO(gregorynisbet): When karte's Search API is capable of taking in asset tag,
	// change the query to use asset tag instead of using hostname.
	karteQuery := &Query{Hostname: dutName}
	if taskName != "" {
		karteQuery.ActionKind = TasknameToMetricsKind(taskName)
	}
	if limit > 0 {
		karteQuery.Limit = limit
	}
	queryRes, err := metricsService.Search(ctx, karteQuery)
	if err != nil {
		return 0, errors.Annotate(err, "count failed repair from karte").Err()
	}
	if queryRes == nil {
		// A nil result without an error is treated like an empty result
		// rather than dereferenced.
		return 0, nil
	}
	var failedRepairCount int
	for _, action := range queryRes.Actions {
		if action == nil {
			// A missing record carries no status; skip it instead of panicking.
			continue
		}
		if action.Status == ActionStatusSuccess {
			// We are counting the failed repair tasks after the last successful
			// task, so the first successful record ends the count.
			break
		}
		failedRepairCount++
	}
	return failedRepairCount, nil
}
// TasknameToMetricsKind returns a Karte action kind based on taskname.
//
// The recovery, MH recovery, and deep recovery task names all map to the
// kind of the plain recovery task; any other task name is used as given.
func TasknameToMetricsKind(tn string) string {
	// By default the kind is derived from the task name exactly as passed in.
	var kindArg any = tn
	switch buildbucket.TaskName(tn) {
	case buildbucket.Recovery, buildbucket.MHRecovery, buildbucket.DeepRecovery:
		// Normal repair and deep repair share the same set of metrics (e.g. failure count).
		kindArg = buildbucket.Recovery
	}
	return fmt.Sprintf(PerResourceTaskKindGlob, kindArg)
}