// blob: ca4bfb680dc84a6e1e280c2b18547ca7a51230cc [file] [log] [blame]
// Copyright 2019 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
package tasks
import (
"context"
"fmt"
"infra/cmd/skylab/internal/bb"
"time"
"github.com/maruel/subcommands"
"go.chromium.org/luci/auth/client/authcli"
buildbucket_pb "go.chromium.org/luci/buildbucket/proto"
"go.chromium.org/luci/common/cli"
"go.chromium.org/luci/common/errors"
skycmdlib "infra/cmd/skylab/internal/cmd/cmdlib"
"infra/cmd/skylab/internal/flagx"
"infra/cmd/skylab/internal/site"
"infra/cmd/skylab/internal/userinput"
"infra/cmdsupport/cmdlib"
"infra/libs/skylab/swarming"
)
const (
// dayInMinutes is the number of minutes in one day. innerRun rejects any
// requested lease duration that is greater than or equal to this value.
dayInMinutes = 24 * 60
// maxTasksPerModel is the maximum number of tasks that are allowed to be executing
// at the same time for a given model.
maxTasksPerModel = 1
// maxTasksPerBoard is the maximum number of tasks that are allowed to be executing
// at the same time for a given board. It is a completely independent cap from
// maxTasksPerModel. A board lease does not count towards the model cap and vice versa.
// leaseDUTByBoard refuses every request while this value is <= 0, so with the
// current value of 0 leases by board are disabled.
maxTasksPerBoard = 0
// boardLease, modelLease and hostnameLease name the primary dimension a lease
// is keyed on. Only boardLease is declared with the explicit
// primaryLeaseDimensionType; modelLease and hostnameLease are untyped string
// constants, which convert implicitly wherever a primaryLeaseDimensionType is
// expected.
boardLease primaryLeaseDimensionType = "board"
modelLease = "model"
hostnameLease = "hostname"
)
// LeaseDut subcommand: Lease a DUT for debugging.
//
// The lease target is a single positional hostname, or a model or board given
// through the -model / -board flags. The command's own help text marks it as
// deprecated in favor of crosfleet (go/crosfleet-cli).
var LeaseDut = &subcommands.Command{
UsageLine: "lease-dut HOST\n\tskylab lease-dut -model MODEL",
ShortDesc: "lease DUT for debugging [DEPRECATED--please use crosfleet (go/crosfleet-cli)]",
LongDesc: `[DEPRECATED--please use crosfleet (go/crosfleet-cli)]
Lease DUT for debugging.
This subcommand's behavior is subject to change without notice.
Do not build automation around this subcommand.`,
// CommandRun builds a fresh leaseDutRun and registers every flag on it.
CommandRun: func() subcommands.CommandRun {
c := &leaseDutRun{}
c.authFlags.Register(&c.Flags, site.DefaultAuthOptions)
c.envFlags.Register(&c.Flags)
// Use a float so that large values passed on the command line are NOT wrapped.
c.Flags.Float64Var(&c.leaseMinutes, "minutes", 60, "Duration of lease.")
c.Flags.StringVar(&c.leaseReason, "reason", "", "The reason to perform this lease, it must match crbug.com/NNNN or b/NNNN.")
// TODO(gregorynisbet):
// If a model is provided, then we necessarily target DUT_POOL_QUOTA and only
// repair-failed DUTs until a better policy can be implemented.
// NOTE(review): fullBBDimensionsAndTags in this file requests dut_state
// "ready" for model and board leases, which does not match the
// "repair-failed" wording above -- confirm which one is intended.
c.Flags.StringVar(&c.model, "model", "", "Leases may optionally target a model instead of a hostname.")
c.Flags.StringVar(&c.board, "board", "", "Leases may optionally target a board instead of a hostname.")
// We allow arbitrary dimensions to be passed in via the -dim and/or -dims flags.
// For maximum flexibility, both flags can take one or more key:val or key=val
// pairs (separated by ","), and repeated/mixed flags are allowed. To keep the
// docs intuitive, docstrings describe the more natural arg format for each flag.
// Both flags are bound to the same c.dims map.
c.Flags.Var(flagx.Dims(&c.dims), "dim", "Single additional dimension in format key=value or key:value; may be specified multiple times.")
c.Flags.Var(flagx.Dims(&c.dims), "dims", "List of additional dimensions in format key1=value1,key2=value2,... or key1:value1,key2:value2,... .")
c.Flags.BoolVar(&c.evilLease, "evil-lease", false, "Evil lease allows the user to bypass the lease per model limit if there's an emergency.")
return c
},
}
// primaryLeaseDimensionType names the kind of dimension a lease is keyed on.
// The values used in this file are "board", "model" and "hostname".
type primaryLeaseDimensionType string
// leaseDutRun carries the parsed command-line state for one invocation of the
// lease-dut subcommand.
type leaseDutRun struct {
subcommands.CommandRunBase
// authFlags is handed to the HTTP, Buildbucket and UFS client constructors.
authFlags authcli.Flags
// envFlags provides, through Env(), the swarming service and the DUT leaser
// builder used by this command.
envFlags skycmdlib.EnvFlags
// leaseMinutes is the value of -minutes. It is a float so that very large
// command-line values are not wrapped; it is truncated to a whole number of
// minutes when converted to a duration.
leaseMinutes float64
// leaseReason is the value of -reason; it is recorded in the
// "lease-reason:" build tag.
leaseReason string
// model and board are the values of -model and -board. Exactly one of a
// positional hostname, model or board must be supplied.
model string
board string
// dims holds the extra dimensions collected from -dim / -dims. They are
// merged into the build dimensions and also emitted as tags.
dims map[string]string
// evilLease is the value of -evil-lease; when set, the per-model cap on
// active leases is not enforced.
evilLease bool
}
// Run executes the lease-dut subcommand. It delegates to innerRun, prints any
// resulting error through cmdlib.PrintError and maps the outcome to a process
// exit code: 0 on success, 1 on failure.
func (c *leaseDutRun) Run(a subcommands.Application, args []string, env subcommands.Env) int {
	err := c.innerRun(a, args, env)
	if err == nil {
		return 0
	}
	cmdlib.PrintError(a, err)
	return 1
}
// innerRun validates the command line, schedules a dut_leaser build for the
// requested hostname, board or model, waits for that build to start, and then
// prints the leased DUT's FQDN and the approximate lease expiry time.
//
// Exactly one of a positional hostname, -model or -board must be given.
// Invalid arguments produce a usage error; any error from client creation,
// scheduling or polling is returned unchanged.
func (c *leaseDutRun) innerRun(a subcommands.Application, args []string, env subcommands.Env) error {
	hasOneHostname := len(args) == 1
	hasModel := c.model != ""
	hasBoard := c.board != ""
	if !exactlyOne(hasOneHostname, hasModel, hasBoard) {
		return cmdlib.NewUsageError(c.Flags, "exactly one hostname or model or board required.")
	}
	if c.leaseMinutes < 0 {
		// Pass format and arguments directly instead of pre-formatting the
		// message and then using the result as a format string.
		return cmdlib.NewUsageError(c.Flags, "minutes to lease (%d) cannot be negative", int64(c.leaseMinutes))
	}
	if c.leaseMinutes >= dayInMinutes {
		return cmdlib.NewUsageError(c.Flags, "Lease duration (%d minutes) cannot exceed 1 day [%d minutes]", int64(c.leaseMinutes), dayInMinutes)
	}
	if len(c.leaseReason) > 30 {
		return cmdlib.NewUsageError(c.Flags, "the lease reason is limited in 30 characters")
	}
	// Reject a reason only when it is NOT a well-formed bug reference. The
	// previous condition was inverted and refused exactly the inputs the error
	// message asks for. An empty reason (the flag default) is still accepted so
	// invocations without -reason keep working.
	// NOTE(review): assumes userinput.ValidBug reports true for well-formed
	// crbug.com/NNNN or b/NNNN references -- confirm against its definition.
	if c.leaseReason != "" && !userinput.ValidBug(c.leaseReason) {
		return cmdlib.NewUsageError(c.Flags, "the lease reason must match crbug.com/NNNN or b/NNNN")
	}

	ctx := cli.GetContext(a, c, env)
	// The float is truncated to whole minutes by the conversion.
	leaseDuration := time.Duration(c.leaseMinutes) * time.Minute
	sc, err := c.newSwarmingClient(ctx)
	if err != nil {
		return err
	}
	bc, err := bb.NewClient(ctx, c.envFlags.Env().DUTLeaserBuilderInfo, c.authFlags)
	if err != nil {
		return err
	}

	var taskID int64
	switch {
	case hasOneHostname:
		oldhost := args[0]
		host := skycmdlib.FixSuspiciousHostname(oldhost)
		if host != oldhost {
			fmt.Fprintf(a.GetErr(), "correcting (%s) to (%s)\n", oldhost, host)
		}
		taskID, err = c.leaseDutByHostname(ctx, a, sc, bc, leaseDuration, host)
	case hasBoard:
		taskID, err = c.leaseDUTByBoard(ctx, a, sc, bc, leaseDuration)
	default:
		// exactlyOne guarantees that the remaining case is a model lease.
		taskID, err = c.leaseDUTByModel(ctx, a, sc, bc, leaseDuration)
	}
	if err != nil {
		return err
	}

	dutName, err := c.waitForBuildStart(ctx, bc, taskID)
	if err != nil {
		return err
	}
	fqdn := dutNameToFQDN(dutName)
	fmt.Fprintf(a.GetOut(), "%s\n", fqdn)
	// TODO(ayatane): The time printed here may be off by the poll interval above.
	fmt.Fprintf(a.GetOut(), "DUT leased until %s\n", time.Now().Add(leaseDuration).Format(time.RFC1123))
	return nil
}
// leaseDutByHostname schedules a dut_leaser build that targets one specific
// host and returns the id of the scheduled build.
//
// The host's model is looked up first and, when non-empty, attached as a
// "label-model:<model>" tag so that leases can be counted per model.
func (c *leaseDutRun) leaseDutByHostname(ctx context.Context, a subcommands.Application, sc *swarming.Client, bc *bb.Client, leaseDuration time.Duration, host string) (taskID int64, err error) {
	ufsClient, err := getUFSClient(ctx, &c.authFlags, c.envFlags.Env())
	if err != nil {
		return 0, err
	}

	// Resolve the model so the lease build can carry a model tag.
	hostModel, err := getModelForHost(ctx, ufsClient, host)
	if err != nil {
		return 0, err
	}
	fmt.Fprintf(a.GetErr(), "inferred model (%s)\n", hostModel)

	var modelTags []string
	if hostModel != "" {
		modelTags = []string{fmt.Sprintf("label-model:%s", hostModel)}
	}

	bbDims, bbTags, err := c.fullBBDimensionsAndTags(ctx, sc, hostnameLease, host, modelTags...)
	if err != nil {
		return 0, err
	}

	leaseMinutes := int32(leaseDuration.Minutes())
	buildID, err := bc.ScheduleDUTLeaserBuild(ctx, bbDims, bbTags, leaseMinutes)
	if err != nil {
		return 0, err
	}

	fmt.Fprintf(a.GetOut(), "Created lease for host %s: %s\n", host, bc.BuildURL(buildID))
	fmt.Fprintf(a.GetOut(), "Waiting for task to start; lease isn't active yet\n")
	return buildID, nil
}
// leaseDUTByModel schedules a dut_leaser build that targets c.model rather
// than a specific host and returns the id of the scheduled build. The build
// requests the "ready" DUT state in the DUT_POOL_QUOTA pool (see
// fullBBDimensionsAndTags).
//
// Unless -evil-lease is set, the request is refused when the number of
// currently active model leases is above maxTasksPerModel.
func (c *leaseDutRun) leaseDUTByModel(ctx context.Context, a subcommands.Application, sc *swarming.Client, bc *bb.Client, leaseDuration time.Duration) (taskID int64, err error) {
	// Consult the compile-time switch before any RPC so that a disabled
	// feature neither costs a network round trip nor gets masked by an
	// unrelated RPC failure.
	if maxTasksPerModel <= 0 {
		return 0, errors.Reason("Leases by model are disabled").Err()
	}
	tasks, err := sc.GetActiveLeaseTasksForModel(ctx, c.model)
	if err != nil {
		return 0, errors.Annotate(err, "computing existing leases").Err()
	}
	// NOTE(review): this admits a new lease while len(tasks) == maxTasksPerModel;
	// confirm whether the cap is meant to be inclusive of the new lease.
	if !c.evilLease && len(tasks) > maxTasksPerModel {
		return 0, fmt.Errorf("number of active tasks %d for model %q exceeds global limit for all users %d", len(tasks), c.model, maxTasksPerModel)
	}
	dims, tags, err := c.fullBBDimensionsAndTags(ctx, sc, modelLease, c.model)
	if err != nil {
		// This is the model path; the annotation previously said "board".
		return 0, errors.Annotate(err, "generating dimensions and tags for model").Err()
	}
	id, err := bc.ScheduleDUTLeaserBuild(ctx, dims, tags, int32(leaseDuration.Minutes()))
	if err != nil {
		return 0, err
	}
	fmt.Fprintf(a.GetOut(), "Created lease for model %s: %s\n", c.model, bc.BuildURL(id))
	return id, nil
}
// leaseDUTByBoard schedules a dut_leaser build that targets c.board rather
// than a specific host and returns the id of the scheduled build.
//
// Board leases are refused outright while maxTasksPerBoard is <= 0, and are
// otherwise refused when the number of currently active board leases is above
// maxTasksPerBoard. The -evil-lease flag is not consulted here.
func (c *leaseDutRun) leaseDUTByBoard(ctx context.Context, a subcommands.Application, sc *swarming.Client, bc *bb.Client, leaseDuration time.Duration) (taskID int64, err error) {
	// Consult the compile-time switch before any RPC: when board leases are
	// disabled the caller should see that message immediately, not pay for a
	// swarming query or receive an unrelated RPC error instead.
	if maxTasksPerBoard <= 0 {
		return 0, errors.Reason("Leases by board are disabled").Err()
	}
	tasks, err := sc.GetActiveLeaseTasksForBoard(ctx, c.board)
	if err != nil {
		return 0, errors.Annotate(err, "computing existing lease for board").Err()
	}
	if len(tasks) > maxTasksPerBoard {
		return 0, errors.Reason("number of active tasks %d for board %q exceeds global limit for all users %d", len(tasks), c.board, maxTasksPerBoard).Err()
	}
	dims, tags, err := c.fullBBDimensionsAndTags(ctx, sc, boardLease, c.board)
	if err != nil {
		return 0, errors.Annotate(err, "generating dimensions and tags for board").Err()
	}
	id, err := bc.ScheduleDUTLeaserBuild(ctx, dims, tags, int32(leaseDuration.Minutes()))
	if err != nil {
		return 0, err
	}
	fmt.Fprintf(a.GetOut(), "Created lease for board %s: %s\n", c.board, bc.BuildURL(id))
	return id, nil
}
// fullBBDimensionsAndTags creates the full Buildbucket dimensions and tags used
// to lease a DUT of the specified board/model/hostname dimension.
//
// primaryDimType selects the lease kind and primaryDim is its value (a
// hostname, board or model). extraTags are placed first in the returned tag
// list. For a hostname lease the host is resolved to a bot id and requested
// through the "id" dimension; for any other kind the build requests
// "label-<kind>", the DUT_POOL_QUOTA pool and the "ready" DUT state.
// Dimensions from -dim / -dims are merged in without overriding the ones set
// here, and are also emitted as tags. An error is returned only when the bot
// id lookup fails.
func (c *leaseDutRun) fullBBDimensionsAndTags(ctx context.Context, sc *swarming.Client, primaryDimType primaryLeaseDimensionType, primaryDim string, extraTags ...string) (map[string]string, []string, error) {
	fullDims := make(map[string]string)

	// Build the tag list in a fresh slice. Appending straight onto the variadic
	// parameter could write into the caller's backing array whenever that
	// slice has spare capacity.
	// Capacity: caller tags + 4 fixed tags + 1 primary-dimension tag + user dims.
	tags := make([]string, 0, len(extraTags)+5+len(c.dims))
	tags = append(tags, extraTags...)
	tags = append(tags,
		"qs_account:leases",
		fmt.Sprintf("lease-by:%s", primaryDimType),
		fmt.Sprintf("lease-reason:%s", c.leaseReason),
		"skylab-tool:lease",
	)

	if primaryDimType == hostnameLease {
		botID, err := sc.DutNameToBotID(ctx, primaryDim)
		if err != nil {
			return nil, nil, err
		}
		fullDims["id"] = botID
		tags = append(tags, fmt.Sprintf("dut-name:%s", primaryDim))
	} else {
		fullDims[fmt.Sprintf("label-%s", primaryDimType)] = primaryDim
		fullDims["label-pool"] = "DUT_POOL_QUOTA"
		fullDims["dut_state"] = "ready"
		tags = append(tags, fmt.Sprintf("%s:%s", primaryDimType, primaryDim))
	}

	// Any arbitrary dimensions passed from the command line should be included in both
	// BB dimensions and tags. On a key clash the dimension computed above wins.
	fullDims = mergeDims(&fullDims, &c.dims)
	tags = append(tags, dimsToTags(&c.dims)...)
	return fullDims, tags, nil
}
// newSwarmingClient builds a swarming client for the swarming service of the
// selected environment, using an HTTP client created from the auth flags.
// It returns a nil client together with the error if either step fails.
func (c *leaseDutRun) newSwarmingClient(ctx context.Context) (*swarming.Client, error) {
	httpClient, err := cmdlib.NewHTTPClient(ctx, &c.authFlags)
	if err != nil {
		return nil, err
	}

	swarmingClient, err := swarming.NewClient(httpClient, c.envFlags.Env().SwarmingService)
	if err != nil {
		return nil, err
	}
	return swarmingClient, nil
}
// waitForBuildStart waits for the dut_leaser build with the given id to start and
// returns the DUT name reported by the started build.
//
// The build is polled every 10 seconds while it is still SCHEDULED. It returns
// an error when fetching the build fails, when the started build carries no
// DUT name, when the build is in any status other than SCHEDULED or STARTED,
// or when ctx is done while waiting between polls (ctx.Err() in that case).
func (c *leaseDutRun) waitForBuildStart(ctx context.Context, client *bb.Client, id int64) (string, error) {
	const pollInterval = 10 * time.Second
	for {
		build, err := client.GetBuild(ctx, id)
		if err != nil {
			return "", err
		}
		switch s := build.Status; s {
		case buildbucket_pb.Status_SCHEDULED:
			// Wait for the next poll, but give up promptly when the context
			// is cancelled instead of sleeping through the full interval.
			timer := time.NewTimer(pollInterval)
			select {
			case <-ctx.Done():
				timer.Stop()
				return "", ctx.Err()
			case <-timer.C:
			}
		case buildbucket_pb.Status_STARTED:
			dutName := build.DUTName
			if dutName == "" {
				return "", errors.Reason("No dut_name for build %d", id).Err()
			}
			return dutName, nil
		default:
			return "", errors.Reason("Got unexpected build status %s", buildbucket_pb.Status_name[int32(s)]).Err()
		}
	}
}
// exactlyOne reports whether precisely one of the given booleans is true.
// It returns false for an empty argument list.
func exactlyOne(bools ...bool) bool {
	found := false
	for _, flag := range bools {
		if !flag {
			continue
		}
		if found {
			// A second true value: no need to look any further.
			return false
		}
		found = true
	}
	return found
}
// dutNameToFQDN turns an unqualified DUT name such as "HOSTNAME" into
// "HOSTNAME.cros.corp.google.com". The input is not validated; the domain
// suffix is appended to whatever string is given.
func dutNameToFQDN(dutName string) string {
	const domain = "cros.corp.google.com"
	return fmt.Sprintf("%s.%s", dutName, domain)
}
// dimsToTags converts a map of dimensions into a slice of "key:value" strings,
// one per map entry. The order of the result follows map iteration order and
// is therefore unspecified. The returned slice is non-nil even when the map is
// empty.
func dimsToTags(dims *map[string]string) []string {
	out := make([]string, 0, len(*dims))
	for key, val := range *dims {
		out = append(out, fmt.Sprintf("%s:%s", key, val))
	}
	return out
}
// mergeDims returns a new map holding the union of both dimension sets. When a
// key is present in both, the value from first is kept. Neither input map is
// modified.
func mergeDims(first, second *map[string]string) map[string]string {
	out := make(map[string]string)
	// Lay down the lower-priority set, then let the higher-priority set
	// overwrite any clashing keys.
	for key, val := range *second {
		out[key] = val
	}
	for key, val := range *first {
		out[key] = val
	}
	return out
}