blob: 4ca57353d1c99b630c2ccd34a49157d75bb6d47b [file] [log] [blame]
// Copyright 2023 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
package main
import (
"context"
"fmt"
"net/url"
"time"
"cloud.google.com/go/bigquery"
"cloud.google.com/go/civil"
"google.golang.org/api/iterator"
"google.golang.org/grpc"
"go.chromium.org/luci/auth"
buildbucketpb "go.chromium.org/luci/buildbucket/proto"
"go.chromium.org/luci/common/errors"
"go.chromium.org/luci/common/logging"
"go.chromium.org/luci/grpc/prpc"
"go.chromium.org/luci/luciexe/build"
"infra/cr_builder_health/healthpb"
)
// Row is one record of builder health data. The `bigquery` struct tags map
// each field to a BigQuery column: getMetrics loads rows from its query result
// and writeIndicators inserts rows into the indicators table.
type Row struct {
// HealthScore is not selected by the getMetrics query; it is assigned by the
// later score calculation steps in this file.
HealthScore int `bigquery:"health_score"`
// ScoreExplanation is a human readable reason for HealthScore. It is not
// selected by the getMetrics query; it is filled in while scores are computed.
ScoreExplanation string `bigquery:"score_explanation"`
// Date is the creation date of the builds aggregated into this row.
Date civil.Date `bigquery:"date"`
// Project, Bucket and Builder identify the builder.
Project string `bigquery:"project"`
Bucket string `bigquery:"bucket"`
Builder string `bigquery:"builder"`
// Rotation is the first entry of the builds' sheriff_rotations input
// property, or the empty string when there is none.
Rotation string `bigquery:"rotation"`
// ContactTeamEmail is copied from the builder's spec in the src config.
ContactTeamEmail string `bigquery:"contact_team_email"`
// N is the number of builds aggregated into this row.
N int `bigquery:"n"`
// Metrics holds the per-metric values computed for this row.
Metrics []*Metric `bigquery:"metrics"`
}
// Metric is a single named measurement attached to a Row.
type Metric struct {
// Type is the metric name produced by the getMetrics query, e.g. "fail_rate"
// or "build_mins_p95".
Type string `bigquery:"type"`
// Value is the measured value for the row's date.
Value float32 `bigquery:"value"`
// Threshold is not selected by the getMetrics query.
// NOTE(review): presumably set by compareThresholds, which is defined
// elsewhere — confirm.
Threshold float32 `bigquery:"threshold"`
// HealthScore is not selected by the getMetrics query. scoreExplanation
// reports every metric whose HealthScore is non-zero as having exceeded its
// threshold.
HealthScore int `bigquery:"health_score"`
}
// BBClient is the subset of the Buildbucket builders client that
// rpcBuildbucket needs: the single SetBuilderHealth RPC.
type BBClient interface {
// SetBuilderHealth sends the given builder health request to Buildbucket.
SetBuilderHealth(ctx context.Context, in *buildbucketpb.SetBuilderHealthRequest, opts ...grpc.CallOption) (*buildbucketpb.SetBuilderHealthResponse, error)
}
// generate runs the whole builder health pipeline for the given input:
//
//  1. create the BigQuery client,
//  2. fetch the chromium and chrome src configs,
//  3. query build metrics,
//  4. compute per-row health scores and per-builder indicators,
//  5. log the indicators,
//  6. unless input.DryRun is set, send the indicators to Buildbucket and write
//     them to BigQuery.
//
// It returns the first error encountered, annotated with the name of the stage
// that produced it.
func generate(ctx context.Context, input *healthpb.InputParams) error {
	bqClient, err := setup(ctx, input)
	if err != nil {
		return errors.Annotate(err, "Setup").Err()
	}
	defer bqClient.Close()

	// Src configs are fetched best-effort per project: the run continues as
	// long as at least one of them is available. Failures are logged rather
	// than silently dropped, and all of them are reported if none succeed.
	srcConfigs := make(map[string]SrcConfig)
	var srcConfigErr error

	chromiumSrcConfig, err := getSrcConfig(ctx, "chromium-review.googlesource.com", "chromium.googlesource.com", "chromium/src")
	if err != nil {
		logging.Warningf(ctx, "Get Src Config for chromium: %s", err)
		srcConfigErr = errors.Join(srcConfigErr, err)
	} else {
		srcConfigs["chromium"] = *chromiumSrcConfig
	}

	chromeSrcConfig, err := getSrcConfig(ctx, "chrome-internal-review.googlesource.com", "chrome-internal.googlesource.com", "chrome/src-internal")
	if err != nil {
		logging.Warningf(ctx, "Get Src Config for chrome: %s", err)
		srcConfigErr = errors.Join(srcConfigErr, err)
	} else {
		srcConfigs["chrome"] = *chromeSrcConfig
	}

	if len(srcConfigs) == 0 {
		return errors.Annotate(srcConfigErr, "Get Src Config").Err()
	}

	rows, err := getMetrics(ctx, bqClient, input)
	if err != nil {
		return errors.Annotate(err, "Get metrics").Err()
	}

	rowsWithHealthScores, err := calculateIntermediateHealthScores(ctx, rows, srcConfigs)
	if err != nil {
		return errors.Annotate(err, "Calculate intermediate health scores").Err()
	}

	rowsWithIndicators, err := calculateIndicators(ctx, input, rowsWithHealthScores, srcConfigs)
	if err != nil {
		return errors.Annotate(err, "Calculate indicators").Err()
	}

	if err = logIndicators(ctx, rowsWithIndicators); err != nil {
		return errors.Annotate(err, "Log indicators").Err()
	}

	// A dry run stops before anything is sent to Buildbucket or BigQuery.
	if input.DryRun {
		return nil
	}

	client, err := bbClient(ctx)
	if err != nil {
		return errors.Annotate(err, "Make BB client").Err()
	}

	if err = rpcBuildbucket(ctx, rowsWithIndicators, client); err != nil {
		return errors.Annotate(err, "RPC buildbucket").Err()
	}

	// Write out to BQ
	if err = writeIndicators(ctx, bqClient, rowsWithIndicators); err != nil {
		return errors.Annotate(err, "Write indicators").Err()
	}
	return nil
}
// setup opens the "Setup" build step, records the input date in the step
// summary and creates the BigQuery client for the
// cr-builder-health-indicators project.
//
// On failure the step is ended with the underlying error and an annotated
// error is returned together with a nil client.
func setup(buildCtx context.Context, input *healthpb.InputParams) (*bigquery.Client, error) {
	var (
		client *bigquery.Client
		err    error
	)
	step, _ := build.StartStep(buildCtx, "Setup")
	// The closure reads err when the function returns, so it sees the result
	// of the client creation below.
	defer func() { step.End(err) }()

	step.SetSummaryMarkdown("Date is " + input.Date.AsTime().String())

	client, err = bigquery.NewClient(buildCtx, "cr-builder-health-indicators")
	if err == nil {
		return client, nil
	}
	return nil, errors.Annotate(err, "Initializing BigQuery client").Err()
}
// getMetrics runs the "Get metrics" build step. It queries
// cr-buildbucket.chrome.builds for the 90 days preceding input.Date,
// restricted to the 'ci' bucket of the chromium and chrome projects, and
// returns one Row per builder per day.
//
// The query orders rows by rotation, then lower-cased builder name, then date
// descending. Each row carries the build count and six metrics: fail rate,
// infra fail rate, and p50/p95 of pending and build minutes.
//
// Any query or row-decoding error is returned annotated, and the build step is
// ended with that same error.
func getMetrics(buildCtx context.Context, bqClient *bigquery.Client, input *healthpb.InputParams) ([]Row, error) {
	var err error
	step, ctx := build.StartStep(buildCtx, "Get metrics")
	defer func() { step.End(err) }()

	q := bqClient.Query(`
SELECT
DATE(b.create_time) as date,
b.builder.project,
b.builder.bucket,
b.builder.builder,
IFNULL(JSON_VALUE_ARRAY(ANY_VALUE(b.input.properties), '$.sheriff_rotations')[OFFSET(0)], '') as rotation,
COUNT(*) as n,
[
STRUCT('fail_rate' as type, COUNTIF(b.status = 'FAILURE') / COUNT(*) as value),
STRUCT('infra_fail_rate' as type, COUNTIF(b.status = 'INFRA_FAILURE') / COUNT(*) as value),
STRUCT('pending_mins_p50' as type, IFNULL(APPROX_QUANTILES(TIMESTAMP_DIFF(start_time, create_time, SECOND), 100)[OFFSET(50)]/60, 0) as value),
STRUCT('pending_mins_p95' as type, IFNULL(APPROX_QUANTILES(TIMESTAMP_DIFF(start_time, create_time, SECOND), 100)[OFFSET(95)]/60, 0) as value),
STRUCT('build_mins_p50' as type, IFNULL(APPROX_QUANTILES(TIMESTAMP_DIFF(end_time, start_time, SECOND), 100)[OFFSET(50)]/60, 0) as value),
STRUCT('build_mins_p95' as type, IFNULL(APPROX_QUANTILES(TIMESTAMP_DIFF(end_time, start_time, SECOND), 100)[OFFSET(95)]/60, 0) as value)
] as metrics
` +
		"FROM `cr-buildbucket.chrome.builds` as b" + `
WHERE
b.create_time < @input_date
AND b.create_time >= TIMESTAMP_SUB(@input_date, INTERVAL 90 DAY)
AND b.builder.bucket = 'ci'
AND b.builder.project IN ('chromium', 'chrome')
GROUP BY
b.builder.project, b.builder.bucket, b.builder.builder, date
ORDER BY
rotation,
LOWER(builder) ASC,
date DESC
`)
	q.Parameters = []bigquery.QueryParameter{
		{
			Name:  "input_date",
			Value: input.Date.AsTime().UTC().Format(iso8601Format),
		},
	}

	it, err := q.Read(ctx)
	if err != nil {
		return nil, errors.Annotate(err, "BQ query").Err()
	}

	var rows []Row
	for {
		var row Row
		// Assign to the function-level err (no ":=") so that the deferred
		// step.End observes a row iteration failure instead of a stale nil.
		err = it.Next(&row)
		if err == iterator.Done {
			// Done only signals the end of the result set; it is not a
			// failure and must not be reported to the step.
			err = nil
			break
		}
		if err != nil {
			logging.Errorf(ctx, "Partially parsed bad row: %+v", row)
			return nil, errors.Annotate(err, "Row iterator").Err()
		}
		logging.Debugf(ctx, "%+v", row)
		rows = append(rows, row)
	}

	step.SetSummaryMarkdown(fmt.Sprintf("Queried %d builders", len(rows)))
	return rows, nil
}
// calculateIntermediateHealthScores runs the "Calculate intermediate health
// scores" build step. For every row it looks up the builder's spec in
// srcConfigs (project -> bucket -> builder) and evaluates each of the
// builder's problem specs against the row via compareThresholds.
//
// Rows are modified in place and the same slice is returned. Rows whose
// project, bucket or builder is missing from srcConfigs get UNSET_SCORE and
// are skipped without being counted as failures. Rows with an invalid spec
// (no problem specs, or a bad threshold configuration) get UNSET_SCORE and an
// explanation, and increment the failure counter.
//
// The returned error joins every error from compareThresholds and, when the
// failure counter is non-zero, a summary error. The build step is ended with
// that same error.
func calculateIntermediateHealthScores(buildCtx context.Context, rows []Row, srcConfigs map[string]SrcConfig) ([]Row, error) {
var stepErr error
step, ctx := build.StartStep(buildCtx, "Calculate intermediate health scores")
defer func() { step.End(stepErr) }()
// Log an error only once when the src config is missing for a builder
srcConfigNotFound := make(map[string]bool)
failedBuilders := 0
for i, row := range rows {
// row is a copy used for reading; all writes go through rows[i].
if srcConfig, ok := srcConfigs[row.Project]; !ok {
rows[i].HealthScore = UNSET_SCORE
if !srcConfigNotFound[builderID(row.Project, "", "")] {
logging.Errorf(ctx, "Src Config not found for project: %s", row.Project)
srcConfigNotFound[builderID(row.Project, "", "")] = true
}
continue
} else if bucketSpec, ok := srcConfig.BucketSpecs[row.Bucket]; !ok {
rows[i].HealthScore = UNSET_SCORE
if !srcConfigNotFound[builderID(row.Project, row.Bucket, "")] {
logging.Errorf(ctx, "Src Config not found for project: %s, bucket: %s", row.Project, row.Bucket)
srcConfigNotFound[builderID(row.Project, row.Bucket, "")] = true
}
continue
} else if builderSpec, ok := bucketSpec[row.Builder]; !ok {
rows[i].HealthScore = UNSET_SCORE
if !srcConfigNotFound[builderID(row.Project, row.Bucket, row.Builder)] {
logging.Errorf(ctx, "Src Config not found for project: %s, bucket: %s, builder: %s", row.Project, row.Bucket, row.Builder)
srcConfigNotFound[builderID(row.Project, row.Bucket, row.Builder)] = true
}
continue
} else {
if len(builderSpec.ProblemSpecs) == 0 {
rows[i].HealthScore = UNSET_SCORE
rows[i].ScoreExplanation = fmt.Sprintf("Src Config error: Bucket: %s, Builder: %s has no ProblemSpecs", row.Bucket, row.Builder)
logging.Errorf(ctx, "Src Config error: Bucket: %s, Builder: %s has no ProblemSpecs", row.Bucket, row.Builder)
failedBuilders += 1
continue
}
sortProblemSpecs(builderSpec.ProblemSpecs) // to give lower scores precedence
// NOTE(review): every `continue` inside the loop below advances to the next
// problem spec, not to the next row. As a result failedBuilders can be
// incremented more than once for the same row, and ContactTeamEmail is still
// assigned after a spec fails validation — confirm this is intended.
for _, problemSpec := range builderSpec.ProblemSpecs {
if problemSpec.Thresholds.Default == "_default" {
// default set, check if any other thresholds erroneously specified
if (problemSpec.Thresholds.BuildTime != PercentileThresholds{} ||
problemSpec.Thresholds.TestPendingTime != PercentileThresholds{} ||
problemSpec.Thresholds.PendingTime != PercentileThresholds{} ||
problemSpec.Thresholds.FailRate != AverageThresholds{} ||
problemSpec.Thresholds.InfraFailRate != AverageThresholds{}) {
rows[i].HealthScore = UNSET_SCORE
rows[i].ScoreExplanation = "Threshold config error: default sentinel and custom thresholds cannot both be set."
logging.Errorf(ctx, "%s Bucket: %s. Builder: %s.", rows[i].ScoreExplanation, row.Bucket, row.Builder)
failedBuilders += 1
continue
}
// _default set, look for a matching default spec
found := false
for _, defaultSpec := range srcConfig.DefaultSpecs {
if defaultSpec.Name == problemSpec.Name {
// NOTE(review): this writes to the loop variable; whether it also changes
// the spec stored in srcConfigs depends on the element type of
// ProblemSpecs, which is declared elsewhere — confirm.
problemSpec.Thresholds = defaultSpec.Thresholds
found = true
break
}
}
if !found {
rows[i].HealthScore = UNSET_SCORE
rows[i].ScoreExplanation = "Threshold config error: default sentinel but no matching default found"
logging.Errorf(ctx, "%s Bucket: %s. Builder: %s.", rows[i].ScoreExplanation, row.Bucket, row.Builder)
failedBuilders += 1
continue
}
} else if problemSpec.Thresholds.Default != "" {
// Unknown sentinel
rows[i].HealthScore = UNSET_SCORE
rows[i].ScoreExplanation = fmt.Sprintf("Threshold config error: Default set to unknown sentinel value: %s.", problemSpec.Thresholds.Default)
logging.Errorf(ctx, "%s Bucket: %s. Builder %s.", rows[i].ScoreExplanation, row.Bucket, row.Builder)
failedBuilders += 1
continue
}
// Happy path, compare thresholds
// Errors are accumulated rather than returned immediately so that the
// remaining specs and rows are still processed.
stepErr = errors.Join(stepErr, compareThresholds(ctx, &rows[i], &problemSpec))
}
rows[i].ContactTeamEmail = builderSpec.ContactTeamEmail
}
}
if failedBuilders > 0 {
stepErr = errors.Join(stepErr, fmt.Errorf("Indicator calculation failed for %d builders", failedBuilders))
}
return rows, stepErr
}
// builderID returns the slash separated identifier "project/bucket/builder".
// Empty components are kept, so builderID("p", "", "") yields "p//".
func builderID(project string, bucket string, builder string) string {
	const sep = "/"
	id := project
	id += sep + bucket
	id += sep + builder
	return id
}
// isWeekend reports whether date falls on a Saturday or a Sunday.
//
// The weekday is taken from the calendar date itself (converted at UTC
// midnight), so no string formatting or parsing is involved. Dates that fail
// civil.Date.IsValid are reported as not being a weekend.
func isWeekend(date civil.Date) bool {
	if !date.IsValid() {
		return false
	}
	switch date.In(time.UTC).Weekday() {
	case time.Saturday, time.Sunday:
		return true
	default:
		return false
	}
}
// calculateIndicators runs the "Calculate indicators" build step and collapses
// the per-day rows into one row per builder.
//
// Weekend rows are ignored. For each builder the first row encountered is used
// as the base row; the caller passes rows sorted by date descending, so that
// is the most recent one. A row contributes to its builder only when the
// builder has a spec in srcConfigs and the row's age, in days relative to
// input.Date, is within the PeriodDays of at least one of the builder's
// problem specs. The builder's HealthScore is the maximum HealthScore over all
// contributing rows.
//
// Each resulting row gets its ScoreExplanation from scoreExplanation, and the
// totals per health category are logged. The order of the returned rows
// follows map iteration and is therefore unspecified. The returned error is
// always nil.
func calculateIndicators(buildCtx context.Context, input *healthpb.InputParams, rows []Row, srcConfigs map[string]SrcConfig) ([]Row, error) {
	var stepErr error
	step, ctx := build.StartStep(buildCtx, "Calculate indicators")
	defer func() { step.End(stepErr) }()

	latestByBuilder := make(map[string]Row)
	for _, row := range rows {
		if isWeekend(row.Date) {
			continue
		}

		key := builderID(row.Project, row.Bucket, row.Builder)
		latest, seen := latestByBuilder[key]
		if !seen {
			// As rows are sorted by date in descending order, this row represents the most recent date
			latest = row
		}

		srcConfig, ok := srcConfigs[row.Project]
		if !ok {
			continue
		}
		bucketSpec, ok := srcConfig.BucketSpecs[row.Bucket]
		if !ok {
			continue
		}
		builderSpec, ok := bucketSpec[row.Builder]
		if !ok {
			continue
		}

		for _, problemSpec := range builderSpec.ProblemSpecs {
			ageDays := civil.DateOf(input.Date.AsTime().UTC()).DaysSince(row.Date)
			if ageDays > problemSpec.PeriodDays {
				continue
			}
			// Let period_days of the unhealthy spec and low-value spec be 7
			// and 90, respectively, and their scores be 5 and 1,
			// respectively.
			// If any score in the last 7 days is greater than 5, the
			// builder is considered healthy.
			// If any score in the last 90 days is greater than 1, the
			// builder is considered unhealthy.
			// Otherwise, the builder is considered low-value.
			latest.HealthScore = max(latest.HealthScore, row.HealthScore)
			latestByBuilder[key] = latest
		}
	}

	var (
		healthyBuilders   int
		unhealthyBuilders int
		lowValueBuilders  int
		inactiveBuilders  int
	)
	rowsWithIndicators := make([]Row, 0, len(latestByBuilder))
	for id, row := range latestByBuilder {
		row.ScoreExplanation = scoreExplanation(row, srcConfigs)
		rowsWithIndicators = append(rowsWithIndicators, row)

		switch {
		case row.HealthScore > 5:
			healthyBuilders++
		case row.HealthScore > 1:
			unhealthyBuilders++
			logging.Errorf(ctx, "Unhealthy builders: %s", id)
		case row.HealthScore > 0:
			lowValueBuilders++
			logging.Errorf(ctx, "Low-value builders: %s", id)
		default:
			inactiveBuilders++
		}
	}

	logging.Errorf(ctx, "Total healthy builders: %d", healthyBuilders)
	logging.Errorf(ctx, "Total unhealthy builders: %d", unhealthyBuilders)
	logging.Errorf(ctx, "Total low-value builders: %d", lowValueBuilders)
	logging.Errorf(ctx, "Total inactive builders: %d", inactiveBuilders)
	return rowsWithIndicators, nil
}
// scoreExplanation builds the explanation text for row's HealthScore.
//
// It finds the first problem spec of the row's builder whose Score equals
// row.HealthScore and lists every metric of the row with a non-zero
// HealthScore, together with its value, its threshold and the spec's
// PeriodDays. It returns the empty string when the builder has no spec in
// srcConfigs, when no problem spec has a matching score, or when no metric has
// a non-zero HealthScore.
func scoreExplanation(row Row, srcConfigs map[string]SrcConfig) string {
	srcConfig, ok := srcConfigs[row.Project]
	if !ok {
		return ""
	}
	bucketSpec, ok := srcConfig.BucketSpecs[row.Bucket]
	if !ok {
		return ""
	}
	builderSpec, ok := bucketSpec[row.Builder]
	if !ok {
		return ""
	}

	// Second pass to insert just the worst problem's ScoreExplanation
	for _, problemSpec := range builderSpec.ProblemSpecs {
		if problemSpec.Score != row.HealthScore {
			continue
		}

		explanation := ""
		for _, metric := range row.Metrics {
			if metric.HealthScore == 0 {
				continue
			}
			if explanation != "" {
				// NOTE(review): this separator looks like a truncated
				// character entity — confirm the intended value.
				explanation += "#013;"
			}
			explanation += fmt.Sprintf("%s of %.2f exceeded the threshold of %.2f for %d days", metric.Type, metric.Value, metric.Threshold, problemSpec.PeriodDays)
		}
		return explanation
	}
	return ""
}
// bbClient runs the "Make BB client" build step: it obtains an authenticated
// HTTP client using silent login and wraps it in a pRPC Buildbucket builders
// client pointed at cr-buildbucket.appspot.com.
//
// When the HTTP client cannot be created, the step is ended with the
// underlying error and an annotated error is returned.
func bbClient(buildCtx context.Context) (buildbucketpb.BuildersClient, error) {
	var err error
	step, _ := build.StartStep(buildCtx, "Make BB client")
	defer func() { step.End(err) }()

	httpClient, err := auth.NewAuthenticator(buildCtx, auth.SilentLogin, auth.Options{}).Client()
	if err != nil {
		return nil, errors.Annotate(err, "Initializing Auth").Err()
	}

	prpcClient := &prpc.Client{
		C:    httpClient,
		Host: "cr-buildbucket.appspot.com",
	}
	return buildbucketpb.NewBuildersPRPCClient(prpcClient), nil
}
// rpcBuildbucket runs the "RPC Buildbucket" build step: it converts every row
// into a BuilderHealth message and sends all of them to Buildbucket in a
// single SetBuilderHealth request.
//
// A failure of the RPC itself is logged and returned as an annotated error.
// Per-builder errors contained in a successful response are only logged and
// counted in the step summary; the function still returns nil in that case.
func rpcBuildbucket(buildCtx context.Context, rows []Row, client BBClient) error {
	var err error
	step, ctx := build.StartStep(buildCtx, "RPC Buildbucket")
	defer func() { step.End(err) }()

	const (
		healthSpecsLink = "https://chromium.googlesource.com/chromium/src/+/refs/heads/main/infra/config/generated/health-specs/health-specs.json"
		designDocLink   = "http://go/builder-health-metrics-design"
	)

	healthProtos := make([]*buildbucketpb.SetBuilderHealthRequest_BuilderHealth, 0, len(rows))
	for _, row := range rows {
		// Buildbucket takes a flat metric name -> value map.
		simplifiedMetrics := make(map[string]float32, len(row.Metrics))
		for _, metric := range row.Metrics {
			simplifiedMetrics[metric.Type] = metric.Value
		}

		dashboardLink := fmt.Sprintf(
			"http://go/builder-health-indicators?f=builder:in:%s&f=bucket:in:%s&f=project:in:%s",
			url.QueryEscape(row.Builder),
			url.QueryEscape(row.Bucket),
			url.QueryEscape(row.Project),
		)

		id := &buildbucketpb.BuilderID{
			Project: row.Project,
			Bucket:  row.Bucket,
			Builder: row.Builder,
		}
		// The links are keyed by domain; the empty key holds the public
		// health-specs link.
		status := &buildbucketpb.HealthStatus{
			HealthScore:   int64(row.HealthScore),
			HealthMetrics: simplifiedMetrics,
			Description:   row.ScoreExplanation,
			DocLinks: map[string]string{
				"":             healthSpecsLink,
				"google.com":   designDocLink,
				"chromium.org": designDocLink,
			},
			DataLinks: map[string]string{
				"":             healthSpecsLink,
				"google.com":   dashboardLink,
				"chromium.org": dashboardLink,
			},
		}
		healthProtos = append(healthProtos, &buildbucketpb.SetBuilderHealthRequest_BuilderHealth{
			Id:     id,
			Health: status,
		})
	}

	res, err := client.SetBuilderHealth(ctx, &buildbucketpb.SetBuilderHealthRequest{Health: healthProtos})
	if err != nil {
		logging.Errorf(ctx, "Set builder health error result: %+v. Error: %s", res, err)
		return errors.Annotate(err, "Set builder health").Err()
	}

	failed := 0
	for _, resp := range res.Responses {
		if respErr := resp.GetError(); respErr != nil {
			failed++
			logging.Errorf(ctx, "Set builder health error: %s.", respErr.String())
		}
	}
	if failed > 0 {
		step.SetSummaryMarkdown(fmt.Sprintf("%d set builder health requests failed", failed))
	}
	return nil
}
// writeIndicators runs the "Write indicators" build step: it inserts rows into
// the builder-health-indicators table of the builder_health_indicators
// dataset using bqClient.
//
// The insert error, if any, is returned unchanged, and the build step is ended
// with that same error.
func writeIndicators(buildCtx context.Context, bqClient *bigquery.Client, rows []Row) error {
	var err error
	step, ctx := build.StartStep(buildCtx, "Write indicators")
	defer func() { step.End(err) }()

	step.SetSummaryMarkdown("Writing to BQ table cr-builder-health-indicators.builder_health_indicators.builder-health-indicators")

	inserter := bqClient.Dataset("builder_health_indicators").Table("builder-health-indicators").Inserter()
	// Assign to the function-level err (no ":=") so that the deferred
	// step.End observes an insert failure instead of a stale nil.
	err = inserter.Put(ctx, rows)
	return err
}
// logIndicators runs the "Print indicators" build step: for every row it logs
// the builder identifier with its health score, then each of the row's
// metrics, then an empty line as a separator. It always returns nil.
func logIndicators(buildCtx context.Context, rowsWithIndicators []Row) error {
	var stepErr error
	step, ctx := build.StartStep(buildCtx, "Print indicators")
	defer func() { step.End(stepErr) }()

	for i := range rowsWithIndicators {
		indicator := rowsWithIndicators[i]
		logging.Errorf(ctx, "%s/%s/%s: HealthScore: %d.", indicator.Project, indicator.Bucket, indicator.Builder, indicator.HealthScore)
		for _, m := range indicator.Metrics {
			logging.Errorf(ctx, "%+v", *m)
		}
		logging.Errorf(ctx, "")
	}
	return nil
}