// blob: 4a6bac9179748849b449958d2f24254b7c78dd5e [file] [log] [blame]
// Copyright 2022 The LUCI Authors. All rights reserved.
// Use of this source code is governed under the Apache License, Version 2.0
// that can be found in the LICENSE file.
syntax = "proto3";
package swarming.internals.rbe;
option go_package = "go.chromium.org/luci/swarming/proto/internals;internalspb";
import "google/protobuf/duration.proto";
import "google/protobuf/empty.proto";
import "google/protobuf/timestamp.proto";
import "proto/config/pools.proto";
// TaggedMessage is an envelope holding an HMAC-tagged protobuf message.
//
// The secret key used for tagging should be communicated off band (e.g. kept
// in some pre-agreed Google Secret Manager secret).
//
// hmac_sha256 is calculated as:
//   hmac_sha256("%d" % payload_type + "\n" + payload, secret_key).
message TaggedMessage {
  // Enumerates the proto message types that `payload` may carry.
  enum PayloadType {
    PAYLOAD_TYPE_UNSPECIFIED = 0;

    // The payload is a wirepb-serialized PollState.
    POLL_STATE = 1;

    // The payload is a wirepb-serialized BotSession.
    BOT_SESSION = 2;
  }

  // Identifies the payload proto message type.
  PayloadType payload_type = 1;

  // The serialized payload proto message.
  bytes payload = 2;

  // The HMAC tag, calculated as described in the TaggedMessage comment.
  bytes hmac_sha256 = 3;
}
// PollState is produced by the Python server and passed to the Go server.
//
// It travels wrapped in a TaggedMessage. Once the HMAC tag is verified, the Go
// server can trust that this message was generated by the Python server and
// was not tampered with.
//
// It is generated by the "/bot/poll" endpoint for bots in the RBE mode. Such
// bots then pass it to the "/bot/rbe/..." endpoints. It contains RBE-related
// parameters for this particular bot, produced based on the bot credentials
// and the Python server configs.
//
// It also contains instructions on how to authenticate the bot on the Go side.
// They are derived from how the Python server authenticated this particular
// bot and are tightly coupled to the swarming.config.BotAuth message.
//
// Once the Go server verifies the HMAC tag and checks that the bot presented
// the exact same credentials as used by the Python side, the Go side can trust
// that this message was not tampered with and was not "substituted" (i.e.
// taken from one bot and replayed on another).
message PollState {
  // Unique ID of this message (primarily to correlate logs that use it).
  string id = 1;

  // Expiration time of this message. It should not be used once it expires.
  google.protobuf.Timestamp expiry = 2;

  // Full RBE instance name to use.
  string rbe_instance = 3;

  // A dimension key together with the values enforced for it.
  message Dimension {
    string key = 1;
    repeated string values = 2;
  }

  // Override these particular dimensions when contacting RBE.
  //
  // These values will be used instead of whatever the bot is reporting. This
  // is used for security-sensitive dimensions like `id` and `pool`.
  //
  // `id` is always populated and has a single value matching the bot ID.
  repeated Dimension enforced_dimensions = 4;

  // Information for logs and debugging (not passed to RBE).
  message DebugInfo {
    // When this message was created.
    google.protobuf.Timestamp created = 1;

    // Version of the Swarming code.
    string swarming_version = 2;

    // ID of the "/bot/poll" request.
    string request_id = 3;
  }
  DebugInfo debug_info = 5;

  //////////////////////////////////////////////////////////////////////////////
  // Instructions for the Go server on how to authenticate the bot.

  // If set, use `X-Luci-Gce-Vm-Token` header for auth.
  //
  // This header should contain a JWT with signed VM metadata with the
  // following expectations:
  //   * Audience matches https://[*-dot-]<app>.appspot.com
  //   * google.compute_engine.project_id field matches `gce_project`.
  //   * google.compute_engine.instance_name matches `gce_instance`.
  message GCEAuth {
    string gce_project = 1;
    string gce_instance = 2;
  }

  // If set, use `Authorization` header with OAuth2 access tokens for auth.
  //
  // The token should have "https://www.googleapis.com/auth/userinfo.email"
  // scope and belong to the given service account.
  message ServiceAccountAuth {
    string service_account = 1;
  }

  // If set, use `X-Luci-Machine-Token` header with LUCI machine token.
  //
  // The token should have the corresponding FQDN in it.
  message LUCIMachineTokenAuth {
    string machine_fqdn = 1;
  }

  // If set, use only `ip_allowlist` field for auth (see below).
  message IPAllowlistAuth {
    // No fields.
  }

  // If set, the bot should be in the corresponding IP allowlist (in addition
  // to the primary auth check described by auth_method). Always set if
  // auth_method is IPAllowlistAuth.
  string ip_allowlist = 10;

  // Describes how to authenticate the bot. See swarming.config.BotAuth.
  //
  // Must be set.
  oneof auth_method {
    GCEAuth gce_auth = 11;
    ServiceAccountAuth service_account_auth = 12;
    LUCIMachineTokenAuth luci_machine_token_auth = 13;
    IPAllowlistAuth ip_allowlist_auth = 14;
  }
}
// BotSession carries an RBE bot session ID and the latest validated PollState.
//
// It travels wrapped in a TaggedMessage. It is produced and verified by the
// Go server whenever the bot calls CreateBotSession or UpdateBotSession.
//
// It serves two purposes:
//   1. Protect the RBE bot session ID from tampering by the bot (e.g. prevent
//      the bot from using a different bot session ID of a bot in another
//      pool).
//   2. Preserve parameters of the last known PollState (in particular auth
//      ones) and bind the session ID to them, so even if the bot has a valid
//      BotSession token from another bot, it won't be able to use it (because
//      it will fail the auth check encoded in the PollState).
//
// The original PollState token has a limited expiration time and it expires if
// the bot doesn't refresh it by calling the polling endpoint served by the
// Python server. When running a long task, the bot is not polling anything and
// can't refresh the PollState token. But it still periodically calls
// UpdateBotSession to send heartbeats to RBE. This is where the PollState
// stored in the BotSession token is verified and where the BotSession token is
// occasionally refreshed.
//
// If a call to UpdateBotSession has both the PollState token and the
// BotSession token (happens when a bot is polling new tasks from RBE), the
// information in the PollState token is used as authoritative, since PollState
// tokens are generated by Python code based on the freshest state of bot
// configs. Fields pulled from such a PollState token are used to update the
// BotSession token.
message BotSession {
  // ID of the RBE's BotSession.
  string rbe_bot_session_id = 1;

  // Poll state extracted from the last seen validated PollState token.
  //
  // Its `expiry` should be ignored in favor of `expiry` field in BotSession.
  PollState poll_state = 2;

  // Expiration time of this message. It should not be used once it expires.
  google.protobuf.Timestamp expiry = 3;
}
// EnqueueRBETask describes the payload of `rbe-enqueue` TQ tasks.
//
// It is submitted into the `rbe-enqueue` Cloud Tasks queue by the Python side
// and processed by the Go side (resulting in a new RBE reservation on
// success).
message EnqueueRBETask {
  // Payload of the new RBE reservation. It will eventually be routed to a bot.
  TaskPayload payload = 1;

  // Fields below are used to decide how to schedule the reservation. Data in
  // them duplicates immutable data already stored in Datastore, but this data
  // is potentially hard to get from Go due to use of LocalStructuredProperty,
  // so it is duplicated here.

  // Full RBE instance ID to submit this task to, extracted from TaskRequest.
  string rbe_instance = 2;

  // When this particular slice expires, extracted from TaskToRunShard.
  google.protobuf.Timestamp expiry = 3;

  // A bot that should execute this slice (if any), extracted from TaskSlice.
  string requested_bot_id = 4;

  // A single constraint: a dimension key and its allowed values.
  message Constraint {
    // The dimension key e.g. "python_version".
    string key = 1;

    // Allowed dimension values to satisfy the constraint, e.g. ["3.8", "3.9"].
    repeated string allowed_values = 2;
  }

  // Constraints on dimensions reported by a matching bot (ANDed together).
  repeated Constraint constraints = 5;

  // Swarming task priority, as submitted by the client.
  int32 priority = 6;

  // Swarming scheduling algorithm, as specified in pools.cfg.
  swarming.config.Pool.SchedulingAlgorithm scheduling_algorithm = 7;

  // How long the task is allowed to run once it starts on the bot.
  google.protobuf.Duration execution_timeout = 8;
}
// CancelRBETask describes the payload of `rbe-cancel` TQ tasks.
//
// It is submitted into the `rbe-cancel` Cloud Tasks queue by the Python side
// and processed by the Go side (resulting in cancellation of an RBE
// reservation).
message CancelRBETask {
  // Full RBE instance ID with the reservation, extracted from TaskRequest.
  string rbe_instance = 1;

  // Reservation to cancel (scoped to the instance).
  string reservation_id = 2;

  // Optional information used for debugging and tracing purposes.
  message DebugInfo {
    // When this message was created.
    google.protobuf.Timestamp created = 1;

    // Version of the Python Swarming.
    string py_swarming_version = 2;

    // The user-supplied task name, FYI.
    string task_name = 3;
  }
  DebugInfo debug_info = 3;
}
// TaskPayload is used as an RBE task payload.
//
// It is serialized as anypb.Any when passed to RBE, and its full proto name
// is thus sensitive.
//
// It points to an existing TaskToRunShardXXX entity representing the pending
// request to execute a single task slice, plus some extra information useful
// for debugging.
//
// It also contains the name of the RBE reservation that will be created to
// represent this task.
message TaskPayload {
  // Unique (within the RBE instance) ID of the reservation, for idempotency.
  string reservation_id = 1;

  // Swarming task ID (aka TaskResultSummary packed id), identifies
  // TaskRequest.
  string task_id = 2;

  // Task slice index (mostly FYI).
  int32 slice_index = 3;

  // Shard index of TaskToRunShardXXX entity class.
  int32 task_to_run_shard = 4;

  // Datastore ID of TaskToRunShardXXX entity (a child of the TaskRequest).
  int64 task_to_run_id = 5;

  // Optional information used for debugging and tracing purposes.
  message DebugInfo {
    // When this message was created.
    google.protobuf.Timestamp created = 1;

    // Version of the Python Swarming.
    string py_swarming_version = 2;

    // Version of the Go Swarming.
    string go_swarming_version = 3;

    // The user-supplied task name, FYI.
    string task_name = 4;
  }
  DebugInfo debug_info = 6;

  // If true, the bot should not contact Python Swarming or execute anything;
  // it should just immediately move the reservation into COMPLETED state.
  //
  // This is useful during initial development to test the RBE task
  // distribution mechanism in isolation from other Swarming guts.
  bool noop = 7;
}
// TaskResult is used as an RBE task result.
//
// TaskResult represents the outcome of a reservation that was processed by a
// bot (successfully or not). If a bot never saw the reservation, or crashed
// midway, TaskResult is not available. There's a more generic
// Reservation.status field for these cases in the RBE API.
//
// TaskResult is serialized into anypb.Any when passed to RBE, and its full
// proto name is thus sensitive.
//
// Note that the corresponding TaskPayload is available in the same RBE
// Reservation proto that contains TaskResult, so TaskPayload fields are not
// duplicated in the TaskResult.
message TaskResult {
  // Set to a human readable string if the bot legitimately skipped executing
  // the reservation, e.g. because it was already claimed. Used for debugging
  // only.
  string skip_reason = 1;

  // Set if the bot picked up the reservation, but could not work on it and
  // gave up. This usually happens if the bot can't claim the TaskToRun after
  // many attempts. This is an internal Swarming error and it results in the
  // task failing with BOT_DIED error.
  string bot_internal_error = 2;
}
// Internals is exposed by the Python Swarming and called by the Go Swarming.
//
// All RPCs are internal to the Swarming backend.
service Internals {
  // Marks the slice as expired or failed and switches the task to the next
  // slice.
  //
  // Does nothing (and succeeds) if the slice is no longer pending or doesn't
  // exist.
  rpc ExpireSlice(ExpireSliceRequest) returns (google.protobuf.Empty);
}
// ExpireSliceRequest is the body of the ExpireSlice internal RPC call.
//
// It identifies a concrete TaskToRunShardXXX entity and the reason it has
// expired.
message ExpireSliceRequest {
  // Swarming task ID (aka TaskResultSummary packed id), identifies
  // TaskRequest.
  string task_id = 1;

  // Shard index of TaskToRunShardXXX entity class.
  int32 task_to_run_shard = 2;

  // Datastore ID of TaskToRunShardXXX entity (a child of the TaskRequest).
  int64 task_to_run_id = 3;

  // The reason the slice is marked as expired.
  enum Reason {
    REASON_UNSPECIFIED = 0;

    // No bots alive that match the requested dimensions.
    NO_RESOURCE = 1;

    // No access to the RBE instance.
    PERMISSION_DENIED = 2;

    // RBE didn't like something about the reservation.
    INVALID_ARGUMENT = 3;

    // The bot picked up the reservation and then died.
    BOT_INTERNAL_ERROR = 4;

    // The scheduling deadline exceeded.
    EXPIRED = 5;
  }
  Reason reason = 4;

  // Additional details accompanying `reason`.
  // NOTE(review): presumably free-form human readable text; the exact contents
  // are not specified in this file, confirm with the producer.
  string details = 5;
}