blob: 56dbfdbc2a744f88ee17bddbcdc6b882f5160fa9 [file] [log] [blame]
// Copyright 2020 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package metrics
import (
"math"
"go.chromium.org/luci/common/tsmon/distribution"
"go.chromium.org/luci/common/tsmon/field"
"go.chromium.org/luci/common/tsmon/metric"
"go.chromium.org/luci/common/tsmon/types"
)
// MaxRetryFieldValue is the number to cap the value of retry count field at,
// for metrics that include it, thus indicating a value greater or equal to it.
// This makes the metric field have a reasonable number of distinct values.
const MaxRetryFieldValue = 10
var (
// bucketer1msTo5min covers range of 1..300k.
bucketer1msTo5min = distribution.GeometricBucketer(math.Pow(10, 0.055), 100)
// TQ guts metrics, primary useful to debug TQ.
InprocSweepDurationMS = metric.NewCumulativeDistribution(
"tq/sweep/inproc/duration",
"Duration of a full inproc sweep cycle across all DBs (ms)",
&types.MetricMetadata{Units: types.Milliseconds},
bucketer1msTo5min,
)
SweepFetchMetaDurationsMS = metric.NewCumulativeDistribution(
"tq/sweep/fetch/meta/durations",
"Duration of FetchRemindersMeta operation (ms)",
&types.MetricMetadata{Units: types.Milliseconds},
bucketer1msTo5min,
field.String("status"), // OK | limit | timeout | failures
field.Int("level"), // 0 means the primary shard task, 1+ are its children
field.String("db"),
)
SweepFetchMetaReminders = metric.NewCounter(
"tq/sweep/fetch/meta/reminders",
"Count of Reminders fetched by FetchRemindersMeta",
nil,
field.String("status"), // OK | limit | timeout | failures
field.Int("level"), // 0 means the primary shard task, 1+ are its children
field.String("db"),
)
ReminderStalenessMS = metric.NewCumulativeDistribution(
"tq/reminders/staleness",
("Distribution of staleness of scanned Reminders during the sweep. " +
"May be incomplete if keyspace wasn't scanned completely"),
&types.MetricMetadata{Units: types.Milliseconds},
distribution.DefaultBucketer,
field.Int("level"),
field.String("db"),
)
RemindersCreated = metric.NewCounter(
"tq/reminders/created",
"Count of reminders created and if they are still fresh in the post-txn defer",
nil,
field.String("task_class"), // matches TaskClass.ID
field.String("staleness"), // fresh | stale
field.String("db"),
)
RemindersDeleted = metric.NewCounter(
"tq/reminders/processed",
"Count of reminders processed (i.e. deleted)",
nil,
field.String("task_class"), // matches TaskClass.ID
field.String("txn_path"), // happy | sweep
field.String("db"),
)
RemindersLatencyMS = metric.NewCumulativeDistribution(
"tq/reminders/latency",
"Time between AddTask call and the deletion of the reminder",
&types.MetricMetadata{Units: types.Milliseconds},
bucketer1msTo5min,
field.String("task_class"), // matches TaskClass.ID
field.String("txn_path"), // happy | sweep
field.String("db"),
)
// TQ metrics that might be useful for TQ clients as well.
SubmitCount = metric.NewCounter(
"tq/submit/count",
"Count of submitted tasks",
nil,
field.String("task_class"), // matches TaskClass.ID
field.String("txn_path"), // none | happy | sweep
field.String("grpc_code"), // gRPC canonical code
)
SubmitDurationMS = metric.NewCumulativeDistribution(
"tq/submit/duration",
"Duration of submit calls",
&types.MetricMetadata{Units: types.Milliseconds},
distribution.DefaultBucketer,
field.String("task_class"), // matches TaskClass.ID
field.String("txn_path"), // none | happy | sweep
field.String("grpc_code"), // gRPC canonical code
)
ServerRejectedCount = metric.NewCounter(
"tq/server/rejected",
"Count of rejected (e.g. malformed) task pushes",
nil,
field.String("reason"), // auth | bad_request | unknown_class | no_handler | bad_payload
)
ServerHandledCount = metric.NewCounter(
"tq/server/handled",
"Count of handled non-rejected tasks",
nil,
field.String("task_class"), // matches TaskClass.ID
field.String("result"), // OK | retry | transient | fatal
field.Int("retry"), // 0 for first try, incrementing until cap.
)
ServerDurationMS = metric.NewCumulativeDistribution(
"tq/server/duration",
"Duration of handling of non-rejected tasks",
&types.MetricMetadata{Units: types.Milliseconds},
distribution.DefaultBucketer,
field.String("task_class"), // matches TaskClass.ID
field.String("result"), // OK | retry | transient | fatal
)
ServerTaskLatency = metric.NewCumulativeDistribution(
"tq/server/latency",
"Time between task's expected ETA and actual completion",
&types.MetricMetadata{Units: types.Milliseconds},
distribution.DefaultBucketer,
field.String("task_class"), // matches TaskClass.ID
field.String("result"), // OK | retry | transient | fatal
field.Int("retry"), // 0 for first try, incrementing until cap.
)
ServerRunning = metric.NewInt(
"tq/server/running",
"Number of task handlers currently running",
nil,
field.String("task_class"), // matches TaskClass.ID
)
)