#!/usr/bin/python
# Copyright 2015 Google Inc. All Rights Reserved.
# pylint: disable=F0401
"""Start, restart and shut down masters as needed."""
import argparse
import logging
import os
import re
import socket
import subprocess
import sys
from functools import partial
from infra.libs.buildbot import master
from infra.libs.service_utils import daemon
from infra.libs.service_utils import outer_loop
from infra.services.master_lifecycle import buildbot_state
from infra_libs import logs
from infra_libs import ts_mon
# Counter of state machine passes. Every increment carries two string fields:
# 'result' and 'action'. In this file run_state_machine_pass() reports
# result='failure'/action='none' when a pass is aborted, and result='success'
# with the selected action name when a pass completes.
run_count = ts_mon.CounterMetric(
'master_manager/run_count',
'Count the number of state machine runs.',
[ts_mon.StringField('result'), ts_mon.StringField('action')])
def parse_args(argv):
  """Parse and validate the master manager command line.

  Unless --list-all-states is given, a master directory and a transition time
  are required and parsing aborts via parser.error() when either is missing.
  After validation the ts_mon task job name is derived from the master
  directory (if it was left at its 'unset' default) and the logs / ts_mon
  option processors are run on the parsed namespace.

  Args:
    argv: list of command-line argument strings (without the program name).

  Returns:
    The populated argparse.Namespace.
  """
  parser = argparse.ArgumentParser(
      description='Manage the state of a buildbot master. NOTE: Does nothing '
                  'unless --prod is specified')
  parser.add_argument('directory', nargs='?',
      help='location of the master to manage')
  parser.add_argument('desired_state', nargs='?',
      choices=buildbot_state.STATES['desired_buildbot_state'],
      help='the desired state of the master')
  parser.add_argument('transition_time_utc', nargs='?', type=float,
      help='seconds since the UTC epoch to trigger the state')
  parser.add_argument('--list-all-states', action='store_true',
      help='list all states with their actions and exit')
  parser.add_argument('--builder-filter', action='append', default=[],
      help='appends a Python regular expression to the list of builder '
           'filters. By default, all builders count as building; if builder '
           'filters are supplied, only builders that match at least one filter '
           'will be counted.')
  parser.add_argument('--drain-timeout', metavar='SECONDS', type=int,
      default=buildbot_state.DEFAULT_DRAIN_TIMEOUT_SEC,
      help='sets the drain state timeout, in seconds.')
  parser.add_argument('--enable-gclient-sync', action='store_true',
      help='perform a gclient sync before every master start')
  parser.add_argument('--emergency-file',
      default='.stop_master_lifecycle',
      help='filename of the emergency stop file. if this file is found in the '
           'master directory, exit immediately')
  parser.add_argument('--hostname',
      default=socket.getfqdn(),
      help='override local hostname (currently %(default)s)')
  parser.add_argument('--prod', action='store_true',
      help='actually run commands instead of printing them.')
  parser.add_argument('--loop', action='store_true',
      help='repeatedly run the state machine. will not terminate unless killed')
  parser.add_argument('--loop-sleep-secs', type=int, default=5,
      help='how many seconds to wait between loop runs. default %(default)s')
  parser.add_argument('--connection-timeout', type=int, default=30,
      help='how many seconds to wait for a master http request before timing '
           'out.')

  outer_loop.add_argparse_options(parser)
  logs.add_argparse_options(parser)
  ts_mon.add_argparse_options(parser)

  parser.set_defaults(
      ts_mon_target_type='task',
      ts_mon_task_job_name='unset',  # Will be overwritten with master name.
      ts_mon_task_service_name='master_manager',
      ts_mon_flush_mode='manual',
  )

  args = parser.parse_args(argv)

  if not args.list_all_states:
    if not args.directory:
      parser.error('A master directory must be specified.')
    # Compare against None rather than truthiness: 0 is a valid epoch
    # timestamp and must not be reported as "not specified".
    if args.transition_time_utc is None:
      parser.error('A transition time must be specified.')

  # With --list-all-states the directory may be omitted, in which case there
  # is no master name to derive and os.path.abspath(None) would raise.
  if args.ts_mon_task_job_name == 'unset' and args.directory is not None:
    # The job name is the last path component of the absolute master directory.
    args.ts_mon_task_job_name = os.path.basename(
        os.path.abspath(args.directory))

  logs.process_argparse_options(args)
  ts_mon.process_argparse_options(args)

  return args
def master_hostname_is_valid(local_hostname, abs_master_directory, logger):
  """Check that this host is the one the mastermap assigns to the master.

  Args:
    local_hostname: hostname to compare against the mastermap entry.
    abs_master_directory: absolute path of the master directory, passed to
        master.get_mastermap_data().
    logger: logger that receives an error message on mismatch.

  Returns:
    True when the mastermap 'fullhost' value equals local_hostname,
    False otherwise.
  """
  mastermap_entry = master.get_mastermap_data(abs_master_directory)
  expected_hostname = mastermap_entry['fullhost']

  hostnames_match = (expected_hostname == local_hostname)
  if not hostnames_match:  # pragma: no cover
    logger.error(
        '%s does not match %s, aborting. use --hostname to override.',
        local_hostname, expected_hostname)
  return hostnames_match
def run_state_machine_pass(
    logger, matchlist, abs_master_directory, emergency_file, desired_state,
    transition_time_utc, enable_gclient_sync, prod, connection_timeout,
    hostname, builder_filters):
  """Run a single pass of the master state machine and act on its verdict.

  The pass aborts early if the emergency stop file exists in the master
  directory or if the hostname check fails. Otherwise evidence about the
  master is collected, the desired state is added to it, and the matcher
  picks a state, an action name and action items. The action items are
  converted to commands which are logged and, only when prod is true,
  executed under each command's lock file.

  Args:
    logger: logger used for all progress and error messages.
    matchlist: matcher object providing execution_list(evidence).
    abs_master_directory: absolute path of the master directory.
    emergency_file: name of the emergency stop file, relative to the master
        directory.
    desired_state: desired state placed into the evidence.
    transition_time_utc: transition time placed into the evidence.
    enable_gclient_sync: forwarded as enable_gclient when converting action
        items to commands.
    prod: when false, commands are only logged (dry run).
    connection_timeout: forwarded to buildbot_state.collect_evidence().
    hostname: hostname validated against the mastermap entry.
    builder_filters: forwarded to buildbot_state.collect_evidence().

  Returns:
    1 if the pass was aborted (emergency file present or hostname mismatch),
    0 otherwise.

  Raises:
    subprocess.CalledProcessError: if an executed command exits non-zero;
        it is not caught here.
  """
  if os.path.exists(os.path.join(
      abs_master_directory, emergency_file)):  # pragma: no cover
    logger.error('%s detected in %s, aborting!',
                 emergency_file, abs_master_directory)
    run_count.increment(fields={'result': 'failure', 'action': 'none'})
    return 1

  if not master_hostname_is_valid(
      hostname, abs_master_directory, logger):  # pragma: no cover
    run_count.increment(fields={'result': 'failure', 'action': 'none'})
    return 1

  evidence = buildbot_state.collect_evidence(
      abs_master_directory,
      connection_timeout=connection_timeout,
      builder_filters=builder_filters)
  evidence['desired_buildbot_state'] = {
      'desired_state': desired_state,
      'transition_time_utc': transition_time_utc,
  }

  state, action_name, action_items = matchlist.execution_list(evidence)
  execution_list = list(
      master.convert_action_items_to_cli(
          action_items, abs_master_directory,
          enable_gclient=enable_gclient_sync))
  logger.info('%s: current state: %s', abs_master_directory, state)
  logger.info('%s: performing action: %s', abs_master_directory, action_name)

  if execution_list:  # pragma: no branch
    if prod:
      logger.info('production run, executing:')
    else:
      logger.info('dry run, not executing:')
    for cmd in execution_list:
      logger.info('* %s (in %s)', cmd['cmd'], cmd['cwd'])
      if prod:
        try:
          # Hold the command's lock file for the duration of the command.
          with daemon.flock(cmd['lockfile']):
            subprocess.check_call(  # pragma: no branch
                [str(x) for x in cmd['cmd']],
                cwd=cmd['cwd'],
                close_fds=True)
        except daemon.LockAlreadyLocked:  # pragma: no cover
          # A held lock skips this command only; the loop moves on to the
          # next one. Logger.warn is a deprecated alias, so use warning().
          logger.warning(
              ' lock on %s could not be acquired, no action taken.',
              cmd['lockfile'])
  else:  # pragma: no cover
    logger.info('no action to be taken.')

  run_count.increment(fields={'result': 'success', 'action': action_name})
  return 0
def run(argv):
  """Parse argv and run the state machine once, or repeatedly with --loop.

  Args:
    argv: list of command-line argument strings, handed to parse_args().

  Returns:
    0 after listing all states or after the loop finishes; otherwise the
    return value of the single state machine pass.
  """
  args = parse_args(argv)

  logger = logging.getLogger(__name__)
  logs.add_handler(logger)

  matchlist = buildbot_state.construct_pattern_matcher(
      drain_timeout_sec=args.drain_timeout)

  if args.list_all_states:  # pragma: no cover
    matchlist.print_all_states()
    return 0

  master_dir = os.path.abspath(args.directory)
  compiled_filters = [re.compile(pattern) for pattern in args.builder_filter]

  # Bind every argument up front so the same zero-argument callable serves
  # both the one-shot and the looping mode.
  single_pass = partial(
      run_state_machine_pass,
      logger,
      matchlist,
      master_dir,
      args.emergency_file,
      args.desired_state,
      args.transition_time_utc,
      args.enable_gclient_sync,
      args.prod,
      args.connection_timeout,
      args.hostname,
      compiled_filters)

  if not args.loop:
    return single_pass()

  def sleep_timeout():  # pragma: no cover
    return args.loop_sleep_secs

  loop_opts = outer_loop.process_argparse_options(args)  # pragma: no cover
  outer_loop.loop(single_pass, sleep_timeout, **loop_opts)  # pragma: no cover
  return 0  # pragma: no cover
def main():  # pragma: no cover
  """Run the tool on the process arguments and flush ts_mon afterwards.

  Returns:
    The value returned by run().
  """
  cli_args = sys.argv[1:]
  exit_code = run(cli_args)
  # parse_args() defaults ts_mon_flush_mode to 'manual', so flush explicitly
  # here before returning.
  ts_mon.flush()
  return exit_code
if __name__ == '__main__':
  # Use main()'s return value as the process exit status.
  exit_status = main()
  sys.exit(exit_status)