blob: 4764b780e11fd8125aab3c50b3c8749e7388a70d [file] [log] [blame]
// Copyright 2022 The LUCI Authors.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.
// Package metrics handles sending metrics to tsmon.
package metrics
import (
buildbucketpb ""
pb ""
var (
// runningAnalysesGauge measures how many analyses are currently running.
runningAnalysesGauge = metric.NewInt(
"The total number running compile analysis, by LUCI project.",
&types.MetricMetadata{Units: "analyses"},
// Field: the LUCI project.
// runningRerunGauge measures how many rerun builds are currently running.
runningRerunGauge = metric.NewInt(
"The number of running rerun builds, by LUCI project.",
&types.MetricMetadata{Units: "reruns"},
// Field: the LUCI project.
// Field: the rerun status, either "running" or "pending".
// Field: the platform, e.g. "mac", "windows", "linux".
// rerunAgeMetric measures the "age" of running rerun builds, in seconds.
rerunAgeMetric = metric.NewNonCumulativeDistribution(
"The age of running reruns, by LUCI project.",
&types.MetricMetadata{Units: "seconds"},
// Field: the LUCI project.
// Field: the rerun status, either "running" or "pending".
// Field: the platform, e.g. "mac", "windows", "linux".
// rerunKey is the key type of the maps used to aggregate the values reported
// through runningRerunGauge and rerunAgeMetric. Each distinct key is one
// project-status-platform combination.
type rerunKey struct {
	Project, Status, Platform string
}
func init() {
// Register metrics as global metrics, which has the effort of
// resetting them after every flush.
tsmon.RegisterGlobalCallback(func(ctx context.Context) {
// Do nothing -- the metrics will be populated by the cron
// job itself and does not need to be triggered externally.
}, runningAnalysesGauge, runningRerunGauge, rerunAgeMetric)
// CollectGlobalMetrics is called in a cron job.
// It collects global metrics and send to tsmon.
func CollectGlobalMetrics(c context.Context) error {
var errs []error
err := collectMetricsForRunningAnalyses(c)
if err != nil {
err = errors.Annotate(err, "collectMetricsForRunningAnalyses").Err()
errs = append(errs, err)
logging.Errorf(c, err.Error())
err = collectMetricsForRunningReruns(c)
if err != nil {
err = errors.Annotate(err, "collectMetricsForRunningReruns").Err()
errs = append(errs, err)
logging.Errorf(c, err.Error())
if len(errs) > 0 {
return errors.NewMultiError(errs...)
return nil
func collectMetricsForRunningAnalyses(c context.Context) error {
runningCount, err := retrieveRunningAnalyses(c)
if err != nil {
return err
// Set the metric
for proj, count := range runningCount {
runningAnalysesGauge.Set(c, int64(count), proj)
return nil
func retrieveRunningAnalyses(c context.Context) (map[string]int, error) {
q := datastore.NewQuery("CompileFailureAnalysis").Eq("run_status", pb.AnalysisRunStatus_STARTED)
analyses := []*model.CompileFailureAnalysis{}
err := datastore.GetAll(c, q, &analyses)
if err != nil {
return nil, errors.Annotate(err, "couldn't get running analyses").Err()
// To store the running analyses for each project
runningCount := map[string]int{}
for _, cfa := range analyses {
build, err := datastoreutil.GetBuild(c, cfa.CompileFailure.Parent().IntID())
if err != nil {
return nil, errors.Annotate(err, "getting build for analysis %d", cfa.Id).Err()
if build == nil {
return nil, fmt.Errorf("getting build for analysis %d", cfa.Id)
runningCount[build.Project] = runningCount[build.Project] + 1
return runningCount, nil
func collectMetricsForRunningReruns(c context.Context) error {
// Query all in-progress single reruns in the last 7 days.
// We set the limit to 7 days because there maybe cases that for some reasons
// (e.g. crashes) that a rerun status may not be updated.
// Any reruns more than 7 days are surely canceled by buildbucket, so it is
// safe to exclude them.
cutoffTime := clock.Now(c).Add(-time.Hour * 7 * 24)
q := datastore.NewQuery("SingleRerun").Eq("Status", pb.RerunStatus_RERUN_STATUS_IN_PROGRESS).Gt("create_time", cutoffTime)
reruns := []*model.SingleRerun{}
err := datastore.GetAll(c, q, &reruns)
if err != nil {
return errors.Annotate(err, "couldn't get running reruns").Err()
// Get the metrics for rerun count and rerun age
// Maps where each key is one project-status-platform combination
rerunCountMap := map[rerunKey]int64{}
rerunAgeMap := map[rerunKey]*distribution.Distribution{}
for _, rerun := range reruns {
proj, platform, err := projectAndPlatformForRerun(c, rerun)
if err != nil {
return errors.Annotate(err, "projectForRerun %d", rerun.Id).Err()
rerunBuild := &model.CompileRerunBuild{
Id: rerun.RerunBuild.IntID(),
err = datastore.Get(c, rerunBuild)
if err != nil {
return errors.Annotate(err, "couldn't get rerun build %d", rerun.RerunBuild.IntID()).Err()
var key = rerunKey{
Project: proj,
Platform: platform,
if rerunBuild.Status == buildbucketpb.Status_STATUS_UNSPECIFIED || rerunBuild.Status == buildbucketpb.Status_SCHEDULED {
key.Status = "pending"
if rerunBuild.Status == buildbucketpb.Status_STARTED {
key.Status = "running"
if key.Status != "" {
rerunCountMap[key] = rerunCountMap[key] + 1
if _, ok := rerunAgeMap[key]; !ok {
rerunAgeMap[key] = distribution.New(rerunAgeMetric.Bucketer())
rerunAgeMap[key].Add(rerunAgeInSeconds(c, rerun))
// Send metrics to tsmon
for k, count := range rerunCountMap {
runningRerunGauge.Set(c, count, k.Project, k.Status, k.Platform)
for k, dist := range rerunAgeMap {
rerunAgeMetric.Set(c, dist, k.Project, k.Status, k.Platform)
return nil
func projectAndPlatformForRerun(c context.Context, rerun *model.SingleRerun) (string, string, error) {
cfa, err := datastoreutil.GetCompileFailureAnalysis(c, rerun.Analysis.IntID())
if err != nil {
return "", "", err
build, err := datastoreutil.GetBuild(c, cfa.CompileFailure.Parent().IntID())
if err != nil {
return "", "", errors.Annotate(err, "getting build for analysis %d", cfa.Id).Err()
if build == nil {
return "", "", fmt.Errorf("build for analysis %d does not exist", cfa.Id)
return build.Project, string(build.Platform), nil
func rerunAgeInSeconds(c context.Context, rerun *model.SingleRerun) float64 {
dur := clock.Now(c).Sub(rerun.CreateTime)
return dur.Seconds()