blob: 8e663b1d9c49491e1ce31bd1a0acb6cae80bc57c [file] [log] [blame]
// Copyright 2016 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"bytes"
"context"
"encoding/json"
"fmt"
"time"
"google.golang.org/grpc/codes"
"go.chromium.org/luci/common/logging/memlogger"
"go.chromium.org/luci/common/tsmon"
"go.chromium.org/luci/common/tsmon/metric"
"go.chromium.org/luci/common/tsmon/types"
tokenserver "go.chromium.org/luci/tokenserver/api"
"go.chromium.org/luci/tokenserver/client"
)
// UpdateOutcome describes the overall status of the tokend token update
// process.
//
// It is a plain string: besides the fixed values declared below, additional
// values are generated from RPC errors by OutcomeFromRPCError.
type UpdateOutcome string

// Some known outcomes.
//
// See also OutcomeFromRPCError for outcomes generated from status codes.
const (
	OutcomeTokenIsGood   UpdateOutcome = "TOKEN_IS_GOOD"  // token is still valid
	OutcomeUpdateSuccess UpdateOutcome = "UPDATE_SUCCESS" // successfully updated
	OutcomeCantReadKey   UpdateOutcome = "CANT_READ_KEY"

	// OutcomeMalformedResponse is the correctly spelled name for the
	// "MALFORMED_RESPONSE" outcome.
	OutcomeMalformedResponse UpdateOutcome = "MALFORMED_RESPONSE"

	// OutcomeMalformedReponse is the original, misspelled name of
	// OutcomeMalformedResponse. It has the same type and value and is kept so
	// that existing references keep compiling.
	//
	// Deprecated: use OutcomeMalformedResponse instead.
	OutcomeMalformedReponse = OutcomeMalformedResponse

	// OutcomeUnknownRPCError is what OutcomeFromRPCError returns for errors it
	// cannot classify.
	OutcomeUnknownRPCError UpdateOutcome = "UNKNOWN_RPC_ERROR"

	// NOTE(review): judging by the string value this is a permission error
	// hit while saving the token -- confirm against the caller.
	OutcomePermissionError       UpdateOutcome = "SAVE_TOKEN_PERM_ERROR"
	OutcomeUnknownSaveTokenError UpdateOutcome = "UNKNOWN_SAVE_TOKEN_ERROR"
)
// OutcomeFromRPCError transforms a MintToken error into an update outcome.
//
// The mapping is:
//   - nil error: OutcomeUpdateSuccess;
//   - client.RPCError with a non-OK gRPC code: "GRPC_ERROR_<numeric code>";
//   - client.RPCError with an OK gRPC code: "MINT_TOKEN_ERROR_<ErrorCode>";
//   - any other error: OutcomeUnknownRPCError.
//
// Only an error whose dynamic type is exactly client.RPCError is recognized.
func OutcomeFromRPCError(err error) UpdateOutcome {
	if err == nil {
		return OutcomeUpdateSuccess
	}

	rpcErr, isRPCErr := err.(client.RPCError)
	switch {
	case !isRPCErr:
		return OutcomeUnknownRPCError
	case rpcErr.GrpcCode != codes.OK:
		// Transport-level failure: report the numeric gRPC status code.
		return UpdateOutcome(fmt.Sprintf("GRPC_ERROR_%d", rpcErr.GrpcCode))
	default:
		// The RPC itself went through; report the error code from the reply.
		return UpdateOutcome(fmt.Sprintf("MINT_TOKEN_ERROR_%s", rpcErr.ErrorCode))
	}
}
// UpdateReason describes why tokend attempts to update the token.
type UpdateReason string

// All known reasons for starting the token refresh procedure.
const (
	// UpdateReasonTokenIsGood means the update was skipped.
	UpdateReasonTokenIsGood = UpdateReason("TOKEN_IS_GOOD")

	UpdateReasonNewToken         = UpdateReason("NEW_TOKEN")
	UpdateReasonExpiration       = UpdateReason("TOKEN_EXPIRES")
	UpdateReasonParametersChange = UpdateReason("PARAMS_CHANGE")
	UpdateReasonForceRefresh     = UpdateReason("FORCE_REFRESH")
)
// StatusReport gathers information about a tokend run.
//
// It is picked up by the monitoring harness later: the Report method converts
// it into the JSON-serializable Report struct, SaveToFile writes that to disk
// and SendMetrics exports it as tsmon metrics.
type StatusReport struct {
Version string // major version of the tokend executable
Started time.Time // when the process started
Finished time.Time // when the process finished
UpdateOutcome UpdateOutcome // overall outcome of the token update process
UpdateReason UpdateReason // why tokend attempts to update the token
FailureError error // immediate error that caused the failure; may be nil
MintTokenDuration time.Duration // how long the RPC call lasted (with all retries)
LastToken *tokenserver.TokenFile // last known token (possibly refreshed); may be nil
ServiceVersion string // name and version of the server that generated the token
}
// Report is how the status report looks on disk.
//
// It is built by StatusReport.Report and serialized as JSON by
// StatusReport.SaveToFile. Fields tagged with omitempty are left out of the
// JSON when they hold their zero value.
type Report struct {
TokendVersion string `json:"tokend_version"` // copied from StatusReport.Version
ServiceVersion string `json:"service_version,omitempty"` // copied from StatusReport.ServiceVersion
StartedTS int64 `json:"started_ts"` // StatusReport.Started as Unix seconds
TotalDuration int64 `json:"total_duration_us,omitempty"` // Finished - Started, in microseconds
RPCDuration int64 `json:"rpc_duration_us,omitempty"` // StatusReport.MintTokenDuration, in microseconds
UpdateOutcome string `json:"update_outcome,omitempty"` // string form of StatusReport.UpdateOutcome
UpdateReason string `json:"update_reason,omitempty"` // string form of StatusReport.UpdateReason
FailureError string `json:"failure_error,omitempty"` // FailureError.Error(), empty if there was no error
LogDump string `json:"log_dump"` // left empty by StatusReport.Report; SaveToFile fills it with the log dump
TokenLastUpdateTS int64 `json:"token_last_update_ts,omitempty"` // LastToken.LastUpdate, if LastToken is set
TokenNextUpdateTS int64 `json:"token_next_update_ts,omitempty"` // LastToken.NextUpdate, if LastToken is set
TokenExpiryTS int64 `json:"token_expiry_ts,omitempty"` // LastToken.Expiry, if LastToken is set
}
// Report converts the status into a single JSON-serializable struct.
//
// The start time is exported as Unix seconds and both durations as whole
// microseconds. FailureError is included only when an error is set, and the
// token timestamps only when LastToken is non-nil. LogDump is left empty.
func (s *StatusReport) Report() *Report {
	elapsed := s.Finished.Sub(s.Started)

	out := new(Report)
	out.TokendVersion = s.Version
	out.ServiceVersion = s.ServiceVersion
	out.StartedTS = s.Started.Unix()
	// Integer division of durations truncates to whole microseconds.
	out.TotalDuration = int64(elapsed / time.Microsecond)
	out.RPCDuration = int64(s.MintTokenDuration / time.Microsecond)
	out.UpdateOutcome = string(s.UpdateOutcome)
	out.UpdateReason = string(s.UpdateReason)

	if failure := s.FailureError; failure != nil {
		out.FailureError = failure.Error()
	}

	if tok := s.LastToken; tok != nil {
		out.TokenLastUpdateTS = tok.LastUpdate
		out.TokenNextUpdateTS = tok.NextUpdate
		out.TokenExpiryTS = tok.Expiry
	}

	return out
}
// SaveToFile saves the status report and the log to a file on disk.
//
// The contents of l are dumped into the report's LogDump field, the report is
// encoded as indented JSON and the result is handed to AtomicWriteFile with
// mode 0644. It returns the JSON encoding error or whatever AtomicWriteFile
// returns.
func (s *StatusReport) SaveToFile(ctx context.Context, l *memlogger.MemLogger, path string) error {
	var logs bytes.Buffer
	l.Dump(&logs)

	out := s.Report()
	out.LogDump = logs.String()

	encoded, err := json.MarshalIndent(out, "", " ")
	if err != nil {
		return err
	}
	return AtomicWriteFile(ctx, path, encoded, 0644)
}
////////////////////////////////////////////////////////////////////////////////
// All tsmon metrics.
//
// They are set from a StatusReport by SendMetrics.
var (
// E.g. "1.0". See Version const in main.go.
metricVersion = metric.NewString(
"luci/machine_tokend/version",
"Major version of luci_machine_tokend executable",
nil)
// E.g. "luci-token-server/2123-abcdef" (<appid>/<version>).
metricServiceVersion = metric.NewString(
"luci/machine_tokend/service_version",
"Identifier of the server version that generated the token",
nil)
// This should be >=30 min in the future if everything is ok. If update
// process fails repeatedly, it will be in the past (and the token is unusable
// at this point).
metricTokenExpiry = metric.NewInt(
"luci/machine_tokend/token_expiry_ts",
"Unix timestamp of when the token expires, in microsec",
&types.MetricMetadata{Units: types.Microseconds})
// This should be no longer than 30 min in the past if everything is ok.
metricTokenLastUpdate = metric.NewInt(
"luci/machine_tokend/last_update_ts",
"Unix timestamp of when the token was successfully updated, in microsec",
&types.MetricMetadata{Units: types.Microseconds})
// This should be [0-30] min in the future if everything is ok. If update
// process fails (at least once), it will be in the past. It's not a fatal
// condition yet.
metricTokenNextUpdate = metric.NewInt(
"luci/machine_tokend/next_update_ts",
"Unix timestamp of when the token must be updated next time, in microsec",
&types.MetricMetadata{Units: types.Microseconds})
// See UpdateOutcome enum and OutcomeFromRPCError for possible values.
//
// Positive values are "TOKEN_IS_GOOD" and "UPDATE_SUCCESS".
metricUpdateOutcome = metric.NewString(
"luci/machine_tokend/update_outcome",
"Overall outcome of the luci_machine_tokend invocation",
nil)
// See UpdateReason enum for possible values.
metricUpdateReason = metric.NewString(
"luci/machine_tokend/update_reason",
"Why the token was updated or 'TOKEN_IS_GOOD' if token is still valid",
nil)
// Set from Report.TotalDuration, i.e. the time between StatusReport.Started
// and StatusReport.Finished in microseconds.
metricTotalDuration = metric.NewInt(
"luci/machine_tokend/duration_total_us",
"For how long luci_machine_tokend ran (including all local IO) in microsec",
&types.MetricMetadata{Units: types.Microseconds})
// Set from Report.RPCDuration, i.e. StatusReport.MintTokenDuration in
// microseconds.
metricRPCDuration = metric.NewInt(
"luci/machine_tokend/duration_rpc_us",
"For how long an RPC to backend ran in microsec",
&types.MetricMetadata{Units: types.Microseconds})
)
// SendMetrics is called at the end of the token update process.
//
// It sets all relevant tsmon metrics from the report and flushes them. The
// whole operation runs under a context limited to 10 seconds. The service
// version and the three token timestamps are only set when they are non-zero;
// the timestamps are multiplied by 1e6 to match the microsecond units declared
// on their metrics. The error from tsmon.Flush is returned as is.
func (s *StatusReport) SendMetrics(ctx context.Context) error {
	// Scale factor applied to the report's token timestamps.
	const usPerSecond = 1000000

	ctx, stop := context.WithTimeout(ctx, 10*time.Second)
	defer stop()

	r := s.Report()

	metricVersion.Set(ctx, r.TokendVersion)
	if r.ServiceVersion != "" {
		metricServiceVersion.Set(ctx, r.ServiceVersion)
	}

	// A zero timestamp means "unknown", so the metric is left unset.
	if ts := r.TokenExpiryTS; ts != 0 {
		metricTokenExpiry.Set(ctx, ts*usPerSecond)
	}
	if ts := r.TokenLastUpdateTS; ts != 0 {
		metricTokenLastUpdate.Set(ctx, ts*usPerSecond)
	}
	if ts := r.TokenNextUpdateTS; ts != 0 {
		metricTokenNextUpdate.Set(ctx, ts*usPerSecond)
	}

	metricUpdateOutcome.Set(ctx, r.UpdateOutcome)
	metricUpdateReason.Set(ctx, r.UpdateReason)
	metricTotalDuration.Set(ctx, r.TotalDuration)
	metricRPCDuration.Set(ctx, r.RPCDuration)

	return tsmon.Flush(ctx)
}