blob: b56b34310564c7c3ab0b1e1710acbca3d369a506 [file] [log] [blame]
// Copyright 2023 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package model
import (
"context"
"strings"
"time"
"go.chromium.org/luci/auth/identity"
"go.chromium.org/luci/gae/service/datastore"
)
// BotEventType identifies various known bot events.
//
// It is the type of the BotEvent.EventType field, which is stored in the
// `event_type` datastore property.
type BotEventType string
// Bot events that happen outside the scope of a task.
//
// NOTE(review): the string values end up in the `event_type` property of
// BotEvent entities, so presumably they must stay stable. Confirm before
// renaming any of them.
const (
BotEventConnected BotEventType = "bot_connected"
BotEventError BotEventType = "bot_error"
BotEventIdle BotEventType = "bot_idle"
BotEventLog BotEventType = "bot_log"
BotEventMissing BotEventType = "bot_missing"
BotEventPolling BotEventType = "bot_polling"
BotEventRebooting BotEventType = "bot_rebooting"
BotEventShutdown BotEventType = "bot_shutdown"
BotEventTerminate BotEventType = "bot_terminate"
)
// Bot events representing polling outcomes.
//
// All of them use the `request_` prefix in their string form. Per the
// BotCommon.TaskID documentation, `request_task` is one of the event types for
// which TaskID is relevant.
const (
BotEventRestart BotEventType = "request_restart"
BotEventSleep BotEventType = "request_sleep"
BotEventTask BotEventType = "request_task"
BotEventUpdate BotEventType = "request_update"
)
// Bot events related to running tasks.
//
// All of them use the `task_` prefix in their string form. Per the
// BotCommon.TaskID documentation, TaskID is relevant for `task_completed`,
// `task_error` and `task_killed` events.
const (
BotEventTaskCompleted BotEventType = "task_completed"
BotEventTaskError BotEventType = "task_error"
BotEventTaskKilled BotEventType = "task_killed"
BotEventTaskUpdate BotEventType = "task_update"
)
// BotRoot is an entity group root of entities representing a single bot.
//
// Presence of this entity indicates there are BotEvent entities for this bot.
//
// TODO(vadimsh): This entity is an unnecessary complication. Old entities
// cleanup should happen via Cloud Datastore TTL feature, then this entity is
// not needed.
type BotRoot struct {
// Extra are entity properties that didn't match any declared ones below.
//
// Should normally be empty.
Extra datastore.PropertyMap `gae:"-,extra"`
// Key is derived based on the bot ID, see BotRootKey.
//
// It has kind "BotRoot" and uses the bot ID as its string ID.
Key *datastore.Key `gae:"$key"`
// LegacyCurrent is no longer used.
//
// NOTE(review): presumably it is still declared so that the `current`
// property of old entities doesn't show up in Extra; confirm against the
// LegacyProperty implementation.
LegacyCurrent LegacyProperty `gae:"current"`
}
// BotRootKey is a root key of an entity group with info about a bot.
//
// The key has kind "BotRoot", uses botID as its string ID and has no parent.
func BotRootKey(ctx context.Context, botID string) *datastore.Key {
return datastore.NewKey(ctx, "BotRoot", botID, 0, nil)
}
// BotCommon contains properties that are common to both BotInfo and BotEvent.
//
// It is not meant to be stored in the datastore on its own, only as an embedded
// struct inside BotInfo or BotEvent.
type BotCommon struct {
// State is a free form JSON dict with the bot state as reported by the bot.
//
// Swarming itself mostly ignores this information, but it is exposed via API
// and UI, allowing bots to report extended information about themselves to
// Swarming clients.
State []byte `gae:"state,noindex"`
// ExternalIP is the bot's IP address as seen by the server.
ExternalIP string `gae:"external_ip,noindex"`
// AuthenticatedAs is the bot's credentials as seen by the server.
AuthenticatedAs identity.Identity `gae:"authenticated_as,noindex"`
// Version of the bot code the bot is running.
Version string `gae:"version,noindex"`
// Quarantined means the bot is unhealthy and should not receive tasks.
//
// It is set when either:
//   - dimensions['quarantined'] or state['quarantined'] is set by the bot.
//   - API requests from the bot appear to be malformed.
Quarantined bool `gae:"quarantined,noindex"`
// Maintenance message if the bot is in maintenance.
//
// Maintenance state, just like quarantined state, means the bot should not
// receive tasks. The difference is that maintenance is an expected condition:
//   - The bot moves into maintenance state in expected moments.
//   - It is expected to be short and end automatically.
Maintenance string `gae:"maintenance_msg,noindex"`
// TaskID is the packed TaskRunResult key of the relevant task, if any.
//
// For BotInfo, it identifies the current TaskRunResult being executed by
// the bot.
//
// For BotEvent, it is relevant for event types `request_task`, `task_killed`,
// `task_completed`, `task_error`.
//
// Note that it is **not** a packed TaskResultSummary. This `task_id` ends in
// `1` instead of `0`.
//
// TODO(vadimsh): This is unfortunate, since this field ends up in BQ exports
// where it causes confusion: task IDs in other BQ exports are "packed
// TaskResultSummary ID", i.e. end in 0. This complicates joining BQ tables.
TaskID string `gae:"task_id,noindex"`
// LastSeen is the last time the bot contacted the server, if ever.
//
// It is stored in the `last_seen_ts` property.
//
// Note that it is unindexed to avoid hotspotting the datastore, see
// https://chromium.googlesource.com/infra/luci/luci-py/+/4e9aecba
LastSeen datastore.Optional[time.Time, datastore.Unindexed] `gae:"last_seen_ts"`
// IdleSince is when the bot became idle last time, if ever.
//
// It is unset when running the task or hooks.
IdleSince datastore.Optional[time.Time, datastore.Unindexed] `gae:"idle_since_ts"`
// LegacyLeaseID is no longer used.
LegacyLeaseID LegacyProperty `gae:"lease_id"`
// LegacyLeaseExpiration is no longer used.
LegacyLeaseExpiration LegacyProperty `gae:"lease_expiration_ts"`
// LegacyLeasedIndefinitely is no longer used.
LegacyLeasedIndefinitely LegacyProperty `gae:"leased_indefinitely"`
// LegacyMachineType is no longer used.
LegacyMachineType LegacyProperty `gae:"machine_type"`
// LegacyMachineLease is no longer used.
LegacyMachineLease LegacyProperty `gae:"machine_lease"`
// LegacyStateJSON is no longer used.
LegacyStateJSON LegacyProperty `gae:"state_json"`
// LegacyDimensions is no longer used.
//
// Note that it maps to the `dimensions` property, which is distinct from the
// `dimensions_flat` property used by BotInfo.Dimensions and
// BotEvent.Dimensions.
LegacyDimensions LegacyProperty `gae:"dimensions"`
// LegacyIsBusy is no longer used.
LegacyIsBusy LegacyProperty `gae:"is_busy"`
}
// BotInfo contains the latest information about a bot.
type BotInfo struct {
// BotCommon holds the properties shared with BotEvent.
BotCommon
// Extra are entity properties that didn't match any declared ones below.
//
// Should normally be empty.
Extra datastore.PropertyMap `gae:"-,extra"`
// Key is derived based on the bot ID, see BotInfoKey.
Key *datastore.Key `gae:"$key"`
// Dimensions is a list of dimensions reported by the bot.
//
// Dimensions are used for task selection. They are encoded as a sorted list
// of `key:value` strings. Keep in mind that the same key can be used
// multiple times.
//
// The index is used to filter bots by their dimensions in bot listing API.
Dimensions []string `gae:"dimensions_flat"`
// Composite encodes the current state of the bot.
//
// For datastore performance reasons it encodes multiple aspects of the state
// in a single indexed multi-valued field, resulting in somewhat weird
// semantics.
//
// The slice always has 4 items, with the following meaning:
//
//	Composite[0] is one of:
//	  IN_MAINTENANCE     = 1 << 8  # 256
//	  NOT_IN_MAINTENANCE = 1 << 9  # 512
//	Composite[1] is one of:
//	  DEAD  = 1 << 6  # 64
//	  ALIVE = 1 << 7  # 128
//	Composite[2] is one of:
//	  QUARANTINED = 1 << 2  # 4
//	  HEALTHY     = 1 << 3  # 8
//	Composite[3] is one of:
//	  BUSY = 1 << 0  # 1
//	  IDLE = 1 << 1  # 2
Composite []int64 `gae:"composite"`
// FirstSeen is when the bot was seen for the first time.
FirstSeen time.Time `gae:"first_seen_ts,noindex"`
// TaskName matches TaskRequest.Name of the task the bot executes now.
//
// In other words it's the title of the task identified by BotCommon.TaskID.
// Empty if the bot is not executing any tasks now.
TaskName string `gae:"task_name,noindex"`
}
// BotInfoKey builds a BotInfo key given the bot ID.
//
// The resulting key has kind "BotInfo" and the fixed string ID "info". Its
// parent is the BotRoot key of the given bot, so the bot ID itself is carried
// by the parent key.
func BotInfoKey(ctx context.Context, botID string) *datastore.Key {
	root := BotRootKey(ctx, botID)
	return datastore.NewKey(ctx, "BotInfo", "info", 0, root)
}
// BotInfoQuery prepares a query that fetches BotInfo entities.
//
// The returned query targets the "BotInfo" kind and has no filters, ordering
// or limits applied; callers are expected to refine it as needed.
func BotInfoQuery() *datastore.Query {
return datastore.NewQuery("BotInfo")
}
// BotID extracts the bot ID from the entity key.
//
// The bot ID is the string ID of the parent of b.Key. Presumably b.Key is
// expected to be a key produced by BotInfoKey, whose parent is the BotRoot key
// named after the bot.
func (b *BotInfo) BotID() string {
	root := b.Key.Parent()
	return root.StringID()
}
// IsDead is true if this bot is considered dead.
//
// It scans Composite and lets the first liveness marker it meets decide:
// ALIVE (128) yields false, DEAD (64) yields true. If neither marker is
// present (e.g. Composite is empty), the bot is reported as not dead.
func (b *BotInfo) IsDead() bool {
	const (
		dead  = 1 << 6 // 64
		alive = 1 << 7 // 128
	)
	// TODO(vadimsh): Check only b.Composite[1].
	for _, state := range b.Composite {
		if state == alive {
			return false
		}
		if state == dead {
			return true
		}
	}
	return false
}
// IsInMaintenance is true if this bot is in maintenance.
//
// It scans Composite and lets the first maintenance marker it meets decide:
// NOT_IN_MAINTENANCE (512) yields false, IN_MAINTENANCE (256) yields true. If
// neither marker is present (e.g. Composite is empty), the bot is reported as
// not being in maintenance.
func (b *BotInfo) IsInMaintenance() bool {
	const (
		inMaintenance    = 1 << 8 // 256
		notInMaintenance = 1 << 9 // 512
	)
	// TODO(vadimsh): Check only b.Composite[0].
	for _, state := range b.Composite {
		if state == notInMaintenance {
			return false
		}
		if state == inMaintenance {
			return true
		}
	}
	return false
}
// GetStatus returns the bot status.
//
// The result is one of "maintenance", "quarantined", "dead", "running" or
// "ready". Composite is scanned in slice order and the first element that
// matches a known "bad or busy" marker decides the outcome; "ready" is
// returned when no element matches (including an empty Composite).
//
// Assuming Composite follows the layout documented on the BotInfo.Composite
// field (maintenance, liveness, health, busyness), the effective precedence is
// maintenance, then dead, then quarantined, then running.
func (b *BotInfo) GetStatus() string {
	const (
		busy          = 1 << 0 // 1
		quarantined   = 1 << 2 // 4
		dead          = 1 << 6 // 64
		inMaintenance = 1 << 8 // 256
	)
	for _, state := range b.Composite {
		if state == inMaintenance {
			return "maintenance"
		}
		if state == quarantined {
			return "quarantined"
		}
		if state == dead {
			return "dead"
		}
		if state == busy {
			return "running"
		}
	}
	return "ready"
}
// DimenionsByKey returns a list of dimension values with the given key.
//
// Dimensions are stored as `key:value` strings; every entry that starts with
// `k:` contributes the remainder of the string (which may itself contain
// colons) to the result, in the order the entries appear in b.Dimensions.
// Returns nil if there are no dimensions with such key.
func (b *BotInfo) DimenionsByKey(k string) (values []string) {
	prefix := k + ":"
	for _, dim := range b.Dimensions {
		val, found := strings.CutPrefix(dim, prefix)
		if !found {
			continue
		}
		values = append(values, val)
	}
	return values
}
// BotEvent captures information about the bot during some state transition.
//
// Entities of this kind are immutable. They essentially form a log with the
// bot history. Entries are indexed by the timestamp to allow querying this log
// in the chronological order.
type BotEvent struct {
// BotCommon holds the properties shared with BotInfo.
BotCommon
// Extra are entity properties that didn't match any declared ones below.
//
// Should normally be empty.
Extra datastore.PropertyMap `gae:"-,extra"`
// Key identifies the bot and this particular event.
//
// ID is auto-generated by the datastore. The bot is identified via the
// parent key.
Key *datastore.Key `gae:"$key"`
// Timestamp of when this event happened.
//
// The index is used in a bunch of places:
//  1. For ordering events chronologically when listing them.
//  2. Pagination for BQ exports.
//  3. Old event cleanup cron.
Timestamp time.Time `gae:"ts"`
// EventType describes what has happened.
//
// See BotEventType constants for known values.
EventType BotEventType `gae:"event_type,noindex"`
// Message is an optional free form message associated with the event.
Message string `gae:"message,noindex"`
// Dimensions is a list of dimensions reported by the bot.
//
// TODO(vadimsh): Stop indexing this after turning down native Swarming
// scheduler. This index is only used in has_capacity(...) implementation,
// which is a part of the native Swarming scheduler and is not used when
// running on top of RBE. This index is pretty big (~6 TB) and getting rid
// of it may also speed up the bot event insertion transaction.
Dimensions []string `gae:"dimensions_flat"`
}
// BotDimensions is a map with bot dimensions as `key => [values]`.
//
// This type represents bot dimensions in the datastore as a JSON-encoded
// unindexed blob. There's an alternative "flat" indexed representation as a
// list of `key:value` pairs. That flat form is used by the BotInfo.Dimensions
// and BotEvent.Dimensions properties.
type BotDimensions map[string][]string
// ToProperty stores the value as a JSON-blob property.
//
// It delegates to the package-level ToJSONProperty helper and returns its
// result (including any error) as is.
//
// NOTE(review): together with FromProperty this presumably implements the
// datastore.PropertyConverter interface; confirm against the datastore
// package.
func (p *BotDimensions) ToProperty() (datastore.Property, error) {
return ToJSONProperty(p)
}
// FromProperty loads a JSON-blob property.
//
// It delegates to the package-level FromJSONProperty helper, passing p as the
// destination, and returns the helper's error as is.
func (p *BotDimensions) FromProperty(prop datastore.Property) error {
return FromJSONProperty(prop, p)
}