tokenserver/cmd/luci_machine_tokend/status.go - infra/luci/luci-go - Git at Google

 // Copyright 2016 The LUCI Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 package main

 import (
 	"bytes"
 	"context"
 	"encoding/json"
 	"fmt"
 	"time"

 	"google.golang.org/grpc/codes"

 	"go.chromium.org/luci/common/logging/memlogger"
 	"go.chromium.org/luci/common/tsmon"
 	"go.chromium.org/luci/common/tsmon/metric"
 	"go.chromium.org/luci/common/tsmon/types"

 	tokenserver "go.chromium.org/luci/tokenserver/api"
 	"go.chromium.org/luci/tokenserver/client"
 )

 // UpdateOutcome describes overall status of tokend token update process.
 type UpdateOutcome string

 // Some known outcomes.
 //
 // See also OutcomeFromRPCError for outcomes generated from status codes.
 const (
 	OutcomeTokenIsGood           UpdateOutcome = "TOKEN_IS_GOOD"  // token is still valid
 	OutcomeUpdateSuccess         UpdateOutcome = "UPDATE_SUCCESS" // successfully updated
 	OutcomeCantReadKey           UpdateOutcome = "CANT_READ_KEY"
 	OutcomeMalformedReponse      UpdateOutcome = "MALFORMED_RESPONSE"
 	OutcomeUnknownRPCError       UpdateOutcome = "UNKNOWN_RPC_ERROR"
 	OutcomePermissionError       UpdateOutcome = "SAVE_TOKEN_PERM_ERROR"
 	OutcomeUnknownSaveTokenError UpdateOutcome = "UNKNOWN_SAVE_TOKEN_ERROR"
 )

 // OutcomeFromRPCError transform MintToken error into an update outcome.
 func OutcomeFromRPCError(err error) UpdateOutcome {
 	if err == nil {
 		return OutcomeUpdateSuccess
 	}
 	if details, ok := err.(client.RPCError); ok {
 		if details.GrpcCode != codes.OK {
 			return UpdateOutcome(fmt.Sprintf("GRPC_ERROR_%d", details.GrpcCode))
 		}
 		return UpdateOutcome(fmt.Sprintf("MINT_TOKEN_ERROR_%s", details.ErrorCode))
 	}
 	return OutcomeUnknownRPCError
 }

 // UpdateReason describes why tokend attempts to update the token.
 type UpdateReason string

 // All known reasons for starting token refresh procedure.
 const (
 	UpdateReasonTokenIsGood      UpdateReason = "TOKEN_IS_GOOD" // update was skipped
 	UpdateReasonNewToken         UpdateReason = "NEW_TOKEN"
 	UpdateReasonExpiration       UpdateReason = "TOKEN_EXPIRES"
 	UpdateReasonParametersChange UpdateReason = "PARAMS_CHANGE"
 	UpdateReasonForceRefresh     UpdateReason = "FORCE_REFRESH"
 )

 // StatusReport gathers information about tokend run.
 //
 // It is picked up by monitoring harness later.
 type StatusReport struct {
 	Version           string                 // major version of the tokend executable
 	Started           time.Time              // when the process started
 	Finished          time.Time              // when the process finished
 	UpdateOutcome     UpdateOutcome          // overall outcome of the token update process
 	UpdateReason      UpdateReason           // why tokend attempts to update the token
 	FailureError      error                  // immediate error that caused the failure
 	MintTokenDuration time.Duration          // how long RPC call lasted (with all retries)
 	LastToken         *tokenserver.TokenFile // last known token (possibly refreshed)
 	ServiceVersion    string                 // name and version of the server that generated the token
 }

 // Report is how status report looks on disk.
 type Report struct {
 	TokendVersion     string `json:"tokend_version"`
 	ServiceVersion    string `json:"service_version,omitempty"`
 	StartedTS         int64  `json:"started_ts"`
 	TotalDuration     int64  `json:"total_duration_us,omitempty"`
 	RPCDuration       int64  `json:"rpc_duration_us,omitempty"`
 	UpdateOutcome     string `json:"update_outcome,omitempty"`
 	UpdateReason      string `json:"update_reason,omitempty"`
 	FailureError      string `json:"failure_error,omitempty"`
 	LogDump           string `json:"log_dump"`
 	TokenLastUpdateTS int64  `json:"token_last_update_ts,omitempty"`
 	TokenNextUpdateTS int64  `json:"token_next_update_ts,omitempty"`
 	TokenExpiryTS     int64  `json:"token_expiry_ts,omitempty"`
 }

 // Report gathers the report into single JSON-serializable struct.
 func (s *StatusReport) Report() *Report {
 	rep := &Report{
 		TokendVersion:  s.Version,
 		ServiceVersion: s.ServiceVersion,
 		StartedTS:      s.Started.Unix(),
 		TotalDuration:  s.Finished.Sub(s.Started).Nanoseconds() / 1000,
 		RPCDuration:    s.MintTokenDuration.Nanoseconds() / 1000,
 		UpdateOutcome:  string(s.UpdateOutcome),
 		UpdateReason:   string(s.UpdateReason),
 	}
 	if s.FailureError != nil {
 		rep.FailureError = s.FailureError.Error()
 	}
 	if s.LastToken != nil {
 		rep.TokenLastUpdateTS = s.LastToken.LastUpdate
 		rep.TokenNextUpdateTS = s.LastToken.NextUpdate
 		rep.TokenExpiryTS = s.LastToken.Expiry
 	}
 	return rep
 }

 // SaveToFile saves the status report and log to a file on disk.
 func (s *StatusReport) SaveToFile(ctx context.Context, l *memlogger.MemLogger, path string) error {
 	report := s.Report()

 	buf := bytes.Buffer{}
 	l.Dump(&buf)
 	report.LogDump = buf.String()

 	blob, err := json.MarshalIndent(report, "", "  ")
 	if err != nil {
 		return err
 	}
 	return AtomicWriteFile(ctx, path, blob, 0644)
 }

 ////////////////////////////////////////////////////////////////////////////////
 // All tsmon metrics.

 var (
 	// E.g. "1.0". See Version const in main.go.
 	metricVersion = metric.NewString(
 		"luci/machine_tokend/version",
 		"Major version of luci_machine_tokend executable",
 		nil)

 	// E.g. "luci-token-server/2123-abcdef" (<appid>/<version>).
 	metricServiceVersion = metric.NewString(
 		"luci/machine_tokend/service_version",
 		"Identifier of the server version that generated the token",
 		nil)

 	// This should be >=30 min in the future if everything is ok. If update
 	// process fails repeatedly, it will be in the past (and the token is unusable
 	// at this point).
 	metricTokenExpiry = metric.NewInt(
 		"luci/machine_tokend/token_expiry_ts",
 		"Unix timestamp of when the token expires, in microsec",
 		&types.MetricMetadata{Units: types.Microseconds})

 	// This should be no longer than 30 min in the past if everything is ok.
 	metricTokenLastUpdate = metric.NewInt(
 		"luci/machine_tokend/last_update_ts",
 		"Unix timestamp of when the token was successfully updated, in microsec",
 		&types.MetricMetadata{Units: types.Microseconds})

 	// This should be [0-30] min in the future if everything ok. If update process
 	// fails (at least once), it will be in the past. It's not a fatal condition
 	// yet.
 	metricTokenNextUpdate = metric.NewInt(
 		"luci/machine_tokend/next_update_ts",
 		"Unix timestamp of when the token must be updated next time, in microsec",
 		&types.MetricMetadata{Units: types.Microseconds})

 	// See UpdateOutcome enum and OutcomeFromRPCError for possible values.
 	//
 	// Positive values are "TOKEN_IS_GOOD" and "UPDATE_SUCCESS".
 	metricUpdateOutcome = metric.NewString(
 		"luci/machine_tokend/update_outcome",
 		"Overall outcome of the luci_machine_tokend invocation",
 		nil)

 	// See UpdateReason enum for possible values.
 	metricUpdateReason = metric.NewString(
 		"luci/machine_tokend/update_reason",
 		"Why the token was updated or 'TOKEN_IS_GOOD' if token is still valid",
 		nil)

 	metricTotalDuration = metric.NewInt(
 		"luci/machine_tokend/duration_total_us",
 		"For how long luci_machine_tokend ran (including all local IO) in microsec",
 		&types.MetricMetadata{Units: types.Microseconds})

 	metricRPCDuration = metric.NewInt(
 		"luci/machine_tokend/duration_rpc_us",
 		"For how long an RPC to backend ran in microsec",
 		&types.MetricMetadata{Units: types.Microseconds})
 )

 // SendMetrics is called at the end of the token update process.
 //
 // It dumps all relevant metrics to tsmon.
 func (s *StatusReport) SendMetrics(ctx context.Context) error {
 	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
 	defer cancel()
 	rep := s.Report()

 	metricVersion.Set(ctx, rep.TokendVersion)
 	if rep.ServiceVersion != "" {
 		metricServiceVersion.Set(ctx, rep.ServiceVersion)
 	}
 	if rep.TokenExpiryTS != 0 {
 		metricTokenExpiry.Set(ctx, rep.TokenExpiryTS*1000000)
 	}
 	if rep.TokenLastUpdateTS != 0 {
 		metricTokenLastUpdate.Set(ctx, rep.TokenLastUpdateTS*1000000)
 	}
 	if rep.TokenNextUpdateTS != 0 {
 		metricTokenNextUpdate.Set(ctx, rep.TokenNextUpdateTS*1000000)
 	}
 	metricUpdateOutcome.Set(ctx, rep.UpdateOutcome)
 	metricUpdateReason.Set(ctx, rep.UpdateReason)
 	metricTotalDuration.Set(ctx, rep.TotalDuration)
 	metricRPCDuration.Set(ctx, rep.RPCDuration)

 	return tsmon.Flush(ctx)
 }
	// Copyright 2016 The LUCI Authors.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	package main

	import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"time"

	"google.golang.org/grpc/codes"

	"go.chromium.org/luci/common/logging/memlogger"
	"go.chromium.org/luci/common/tsmon"
	"go.chromium.org/luci/common/tsmon/metric"
	"go.chromium.org/luci/common/tsmon/types"

	tokenserver "go.chromium.org/luci/tokenserver/api"
	"go.chromium.org/luci/tokenserver/client"
	)

	// UpdateOutcome describes overall status of tokend token update process.
	type UpdateOutcome string

	// Some known outcomes.
	//
	// See also OutcomeFromRPCError for outcomes generated from status codes.
	const (
	OutcomeTokenIsGood UpdateOutcome = "TOKEN_IS_GOOD" // token is still valid
	OutcomeUpdateSuccess UpdateOutcome = "UPDATE_SUCCESS" // successfully updated
	OutcomeCantReadKey UpdateOutcome = "CANT_READ_KEY"
	OutcomeMalformedReponse UpdateOutcome = "MALFORMED_RESPONSE"
	OutcomeUnknownRPCError UpdateOutcome = "UNKNOWN_RPC_ERROR"
	OutcomePermissionError UpdateOutcome = "SAVE_TOKEN_PERM_ERROR"
	OutcomeUnknownSaveTokenError UpdateOutcome = "UNKNOWN_SAVE_TOKEN_ERROR"
	)

	// OutcomeFromRPCError transform MintToken error into an update outcome.
	func OutcomeFromRPCError(err error) UpdateOutcome {
	if err == nil {
	return OutcomeUpdateSuccess
	}
	if details, ok := err.(client.RPCError); ok {
	if details.GrpcCode != codes.OK {
	return UpdateOutcome(fmt.Sprintf("GRPC_ERROR_%d", details.GrpcCode))
	}
	return UpdateOutcome(fmt.Sprintf("MINT_TOKEN_ERROR_%s", details.ErrorCode))
	}
	return OutcomeUnknownRPCError
	}

	// UpdateReason describes why tokend attempts to update the token.
	type UpdateReason string

	// All known reasons for starting token refresh procedure.
	const (
	UpdateReasonTokenIsGood UpdateReason = "TOKEN_IS_GOOD" // update was skipped
	UpdateReasonNewToken UpdateReason = "NEW_TOKEN"
	UpdateReasonExpiration UpdateReason = "TOKEN_EXPIRES"
	UpdateReasonParametersChange UpdateReason = "PARAMS_CHANGE"
	UpdateReasonForceRefresh UpdateReason = "FORCE_REFRESH"
	)

	// StatusReport gathers information about tokend run.
	//
	// It is picked up by monitoring harness later.
	type StatusReport struct {
	Version string // major version of the tokend executable
	Started time.Time // when the process started
	Finished time.Time // when the process finished
	UpdateOutcome UpdateOutcome // overall outcome of the token update process
	UpdateReason UpdateReason // why tokend attempts to update the token
	FailureError error // immediate error that caused the failure
	MintTokenDuration time.Duration // how long RPC call lasted (with all retries)
	LastToken *tokenserver.TokenFile // last known token (possibly refreshed)
	ServiceVersion string // name and version of the server that generated the token
	}

	// Report is how status report looks on disk.
	type Report struct {
	TokendVersion string `json:"tokend_version"`
	ServiceVersion string `json:"service_version,omitempty"`
	StartedTS int64 `json:"started_ts"`
	TotalDuration int64 `json:"total_duration_us,omitempty"`
	RPCDuration int64 `json:"rpc_duration_us,omitempty"`
	UpdateOutcome string `json:"update_outcome,omitempty"`
	UpdateReason string `json:"update_reason,omitempty"`
	FailureError string `json:"failure_error,omitempty"`
	LogDump string `json:"log_dump"`
	TokenLastUpdateTS int64 `json:"token_last_update_ts,omitempty"`
	TokenNextUpdateTS int64 `json:"token_next_update_ts,omitempty"`
	TokenExpiryTS int64 `json:"token_expiry_ts,omitempty"`
	}

	// Report gathers the report into single JSON-serializable struct.
	func (s StatusReport) Report() Report {
	rep := &Report{
	TokendVersion: s.Version,
	ServiceVersion: s.ServiceVersion,
	StartedTS: s.Started.Unix(),
	TotalDuration: s.Finished.Sub(s.Started).Nanoseconds() / 1000,
	RPCDuration: s.MintTokenDuration.Nanoseconds() / 1000,
	UpdateOutcome: string(s.UpdateOutcome),
	UpdateReason: string(s.UpdateReason),
	}
	if s.FailureError != nil {
	rep.FailureError = s.FailureError.Error()
	}
	if s.LastToken != nil {
	rep.TokenLastUpdateTS = s.LastToken.LastUpdate
	rep.TokenNextUpdateTS = s.LastToken.NextUpdate
	rep.TokenExpiryTS = s.LastToken.Expiry
	}
	return rep
	}

	// SaveToFile saves the status report and log to a file on disk.
	func (s StatusReport) SaveToFile(ctx context.Context, l memlogger.MemLogger, path string) error {
	report := s.Report()

	buf := bytes.Buffer{}
	l.Dump(&buf)
	report.LogDump = buf.String()

	blob, err := json.MarshalIndent(report, "", " ")
	if err != nil {
	return err
	}
	return AtomicWriteFile(ctx, path, blob, 0644)
	}

	////////////////////////////////////////////////////////////////////////////////
	// All tsmon metrics.

	var (
	// E.g. "1.0". See Version const in main.go.
	metricVersion = metric.NewString(
	"luci/machine_tokend/version",
	"Major version of luci_machine_tokend executable",
	nil)

	// E.g. "luci-token-server/2123-abcdef" (<appid>/<version>).
	metricServiceVersion = metric.NewString(
	"luci/machine_tokend/service_version",
	"Identifier of the server version that generated the token",
	nil)

	// This should be >=30 min in the future if everything is ok. If update
	// process fails repeatedly, it will be in the past (and the token is unusable
	// at this point).
	metricTokenExpiry = metric.NewInt(
	"luci/machine_tokend/token_expiry_ts",
	"Unix timestamp of when the token expires, in microsec",
	&types.MetricMetadata{Units: types.Microseconds})

	// This should be no longer than 30 min in the past if everything is ok.
	metricTokenLastUpdate = metric.NewInt(
	"luci/machine_tokend/last_update_ts",
	"Unix timestamp of when the token was successfully updated, in microsec",
	&types.MetricMetadata{Units: types.Microseconds})

	// This should be [0-30] min in the future if everything ok. If update process
	// fails (at least once), it will be in the past. It's not a fatal condition
	// yet.
	metricTokenNextUpdate = metric.NewInt(
	"luci/machine_tokend/next_update_ts",
	"Unix timestamp of when the token must be updated next time, in microsec",
	&types.MetricMetadata{Units: types.Microseconds})

	// See UpdateOutcome enum and OutcomeFromRPCError for possible values.
	//
	// Positive values are "TOKEN_IS_GOOD" and "UPDATE_SUCCESS".
	metricUpdateOutcome = metric.NewString(
	"luci/machine_tokend/update_outcome",
	"Overall outcome of the luci_machine_tokend invocation",
	nil)

	// See UpdateReason enum for possible values.
	metricUpdateReason = metric.NewString(
	"luci/machine_tokend/update_reason",
	"Why the token was updated or 'TOKEN_IS_GOOD' if token is still valid",
	nil)

	metricTotalDuration = metric.NewInt(
	"luci/machine_tokend/duration_total_us",
	"For how long luci_machine_tokend ran (including all local IO) in microsec",
	&types.MetricMetadata{Units: types.Microseconds})

	metricRPCDuration = metric.NewInt(
	"luci/machine_tokend/duration_rpc_us",
	"For how long an RPC to backend ran in microsec",
	&types.MetricMetadata{Units: types.Microseconds})
	)

	// SendMetrics is called at the end of the token update process.
	//
	// It dumps all relevant metrics to tsmon.
	func (s *StatusReport) SendMetrics(ctx context.Context) error {
	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()
	rep := s.Report()

	metricVersion.Set(ctx, rep.TokendVersion)
	if rep.ServiceVersion != "" {
	metricServiceVersion.Set(ctx, rep.ServiceVersion)
	}
	if rep.TokenExpiryTS != 0 {
	metricTokenExpiry.Set(ctx, rep.TokenExpiryTS*1000000)
	}
	if rep.TokenLastUpdateTS != 0 {
	metricTokenLastUpdate.Set(ctx, rep.TokenLastUpdateTS*1000000)
	}
	if rep.TokenNextUpdateTS != 0 {
	metricTokenNextUpdate.Set(ctx, rep.TokenNextUpdateTS*1000000)
	}
	metricUpdateOutcome.Set(ctx, rep.UpdateOutcome)
	metricUpdateReason.Set(ctx, rep.UpdateReason)
	metricTotalDuration.Set(ctx, rep.TotalDuration)
	metricRPCDuration.Set(ctx, rep.RPCDuration)

	return tsmon.Flush(ctx)
	}