# blob: 94619c64a4a87157be626958ea33019d1d57b87d [file] [log] [blame]
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
from __future__ import print_function
import json
import logging
import os
import socket
import sys
import re
import requests
from infra_libs.ts_mon.common import interface
from infra_libs.ts_mon.common import metric_store
from infra_libs.ts_mon.common import monitors
from infra_libs.ts_mon.common import standard_metrics
from infra_libs.ts_mon.common import targets
def load_machine_config(filename):
    """Read the machine-wide ts_mon configuration from a JSON file.

    Args:
      filename (str): path of the JSON configuration file.

    Returns:
      The decoded JSON content, or an empty dict when no file exists at
      `filename`.

    Raises:
      Exception: whatever opening or decoding the file raised; the failure
        is logged and then re-raised unchanged.
    """
    if os.path.exists(filename):
        try:
            with open(filename) as config_file:
                return json.load(config_file)
        except Exception:
            # Record which file was at fault, then let the caller decide.
            logging.error('Configuration file couldn\'t be read: %s', filename)
            raise

    # A missing file is not an error: callers get an empty configuration.
    logging.info('Configuration file does not exist, ignoring: %s', filename)
    return {}
def _default_region(fqdn):
    """Pick a default region name for this machine.

    The GCE metadata server is asked for the instance zone first; if it
    answers with an OK status, the last slash-separated component of the
    response text is returned. Otherwise the second dot-separated label of
    `fqdn` is used.

    Args:
      fqdn (str): fully-qualified domain name of this machine.

    Returns:
      The region string, or '' when `fqdn` has no second label.
    """
    metadata_url = (
        'http://metadata.google.internal/computeMetadata/v1/instance/zone')

    # Check if we're running in a GCE instance.
    response = None
    try:
        response = requests.get(
            metadata_url,
            headers={'Metadata-Flavor': 'Google'},
            timeout=1.0)
    except requests.exceptions.RequestException:
        # Request failed: fall through to the FQDN-based guess below.
        pass

    if response is not None and response.status_code == requests.codes.ok:
        # The zone is the last slash-separated component.
        return response.text.split('/')[-1]

    labels = fqdn.split('.')
    if len(labels) > 1:
        return labels[1]  # [chrome|golo]
    return ''
def _default_network(host):
    """Extract the default network identifier from a host name.

    Args:
      host (str): short host name (without the domain part).

    Returns:
      The digit run N captured from names of the form 'masterN', 'masterNa'
      or 'foo-xN' (x being one of a, c, m, optionally followed by a trailing
      'a'), or '' when the name does not fit that pattern.
    """
    # Regular expression that matches the vast majority of our host names.
    matched = re.match(r'^([\w-]*?-[acm]|master)(\d+)a?$', host)
    if matched is None:
        return ''
    return matched.group(2)  # N
def add_argparse_options(parser):
    """Register the timeseries-monitoring flags on a process' parser.

    Every flag is added to a 'Timeseries Monitoring Options' argument group
    created on `parser`. The default hostname is the first label of this
    machine's lower-cased FQDN; the default region and network come from
    _default_region() and _default_network().

    Args:
      parser (argparse.ArgumentParser): the parser for the main process.
    """
    default_config_file = (
        'C:\\chrome-infra\\ts-mon.json' if sys.platform == 'win32'
        else '/etc/chrome-infra/ts-mon.json')

    group = parser.add_argument_group('Timeseries Monitoring Options')
    add = group.add_argument

    # --- Endpoint, credentials and flushing ---------------------------------
    add('--ts-mon-config-file',
        default=default_config_file,
        help=('path to a JSON config file that contains suitable values for '
              '"endpoint" and "credentials" for this machine. This config '
              'file is intended to be shared by all processes on the '
              'machine, as the values depend on the machine\'s position in '
              'the network, IP whitelisting and deployment of credentials. '
              '(default: %(default)s)'))
    add('--ts-mon-endpoint',
        help=('url (file:// or https://) to post monitoring metrics to. '
              'If set, overrides the value in --ts-mon-config-file'))
    add('--ts-mon-credentials',
        help=('path to a pkcs8 json credential file. If set, overrides the '
              'value in --ts-mon-config-file'))
    add('--ts-mon-ca-certs',
        help=('path to file containing root CA certificates for SSL server '
              'certificate validation. If not set, a CA cert file bundled '
              'with httplib2 is used.'))
    add('--ts-mon-flush',
        choices=('manual', 'auto'),
        default='auto',
        help=('metric push behavior: manual (only send when flush() is '
              'called), or auto (send automatically every '
              '--ts-mon-flush-interval-secs seconds). '
              '(default: %(default)s)'))
    add('--ts-mon-flush-interval-secs',
        type=int,
        default=60,
        help=('automatically push metrics on this interval if '
              '--ts-mon-flush=auto.'))

    # --- Target selection ---------------------------------------------------
    add('--ts-mon-autogen-hostname',
        action='store_true',
        help=('Indicate that the hostname is autogenerated. '
              'This option must be set on autoscaled GCE VMs, Kubernetes '
              'pods, or any other hosts with dynamically generated names.'))
    add('--ts-mon-target-type',
        choices=('device', 'task'),
        default='device',
        help=('the type of target that is being monitored ("device" or '
              '"task"). (default: %(default)s)'))

    # Defaults derived from where this process is running.
    fqdn = socket.getfqdn().lower()  # foo-[a|m]N.[chrome|golo].chromium.org
    host = fqdn.partition('.')[0]  # foo-[a|m]N
    region = _default_region(fqdn)
    network = _default_network(host)

    # --- Device target fields -----------------------------------------------
    add('--ts-mon-device-hostname',
        default=host,
        help='name of this device, (default: %(default)s)')
    add('--ts-mon-device-region',
        default=region,
        help=('name of the region this devices lives in. '
              '(default: %(default)s)'))
    add('--ts-mon-device-role',
        default='default',
        help='Role of the device. (default: %(default)s)')
    add('--ts-mon-device-network',
        default=network,
        help=('name of the network this device is connected to. '
              '(default: %(default)s)'))

    # --- Task target fields -------------------------------------------------
    add('--ts-mon-task-service-name',
        help='name of the service being monitored')
    add('--ts-mon-task-job-name',
        help='name of this job instance of the task')
    add('--ts-mon-task-region',
        default=region,
        help=('name of the region in which this task is running '
              '(default: %(default)s)'))
    add('--ts-mon-task-hostname',
        default=host,
        help=('name of the host on which this task is running '
              '(default: %(default)s)'))
    add('--ts-mon-task-number',
        type=int,
        default=0,
        help=('number (e.g. for replication) of this instance of this task '
              '(default: %(default)s)'))

    # --- Miscellaneous ------------------------------------------------------
    add('--ts-mon-metric-name-prefix',
        default='/chrome/infra/',
        help='metric name prefix for all metrics (default: %(default)s)')
    add('--ts-mon-use-new-proto',
        default=True,
        action='store_true',
        help='deprecated and ignored')
def process_argparse_options(args):
    """Configure the global ts_mon state from parsed command-line flags.

    Loads the machine config file, builds the default target (device or
    task), picks the global monitor from the endpoint, starts the background
    flush thread when --ts-mon-flush is 'auto', and initializes the standard
    metrics.

    Exits the process with status 2 when the target type is "task" and
    either --ts-mon-task-service-name or --ts-mon-task-job-name is missing.

    Args:
      args (argparse.Namespace): the result of parsing the command line
        arguments.
    """
    # Parse the config file if it exists.
    machine_config = load_machine_config(args.ts_mon_config_file)
    endpoint = machine_config.get('endpoint', '')
    credentials = machine_config.get('credentials', '')
    autogen_from_config = machine_config.get('autogen_hostname', False)

    # Command-line args override the values in the config file.
    if args.ts_mon_endpoint is not None:
        endpoint = args.ts_mon_endpoint
    if args.ts_mon_credentials is not None:
        credentials = args.ts_mon_credentials

    def _target_hostname(name):
        # Mark dynamically generated host names, whether requested via flag
        # or via the machine config.
        if args.ts_mon_autogen_hostname or autogen_from_config:
            return 'autogen:' + name
        return name

    target_type = args.ts_mon_target_type
    if target_type == 'device':
        interface.state.target = targets.DeviceTarget(
            args.ts_mon_device_region,
            args.ts_mon_device_role,
            args.ts_mon_device_network,
            _target_hostname(args.ts_mon_device_hostname))
    elif target_type == 'task':
        # Reimplement ArgumentParser.error, since we don't have access to the
        # parser here.
        if not args.ts_mon_task_service_name:
            print('Argument --ts-mon-task-service-name must be provided '
                  'when the target type is "task".',
                  file=sys.stderr)
            sys.exit(2)
        if not args.ts_mon_task_job_name:
            print('Argument --ts-mon-task-job-name must be provided '
                  'when the target type is "task".',
                  file=sys.stderr)
            sys.exit(2)
        interface.state.target = targets.TaskTarget(
            args.ts_mon_task_service_name,
            args.ts_mon_task_job_name,
            args.ts_mon_task_region,
            _target_hostname(args.ts_mon_task_hostname),
            args.ts_mon_task_number)

    interface.state.metric_name_prefix = args.ts_mon_metric_name_prefix

    # Start from a no-op monitor; it is replaced only for a supported endpoint.
    interface.state.global_monitor = monitors.NullMonitor()

    file_scheme = 'file://'
    if endpoint.startswith(file_scheme):
        interface.state.global_monitor = monitors.DebugMonitor(
            endpoint[len(file_scheme):])
    elif endpoint.startswith('https://'):
        interface.state.global_monitor = monitors.HttpsMonitor(
            endpoint,
            monitors.CredentialFactory.from_string(credentials),
            ca_certs=args.ts_mon_ca_certs)
    elif endpoint.lower() == 'none':
        logging.info('ts_mon monitoring has been explicitly disabled')
    else:
        logging.error('ts_mon monitoring is disabled because the endpoint '
                      'provided is invalid or not supported: %s', endpoint)

    flush_mode = args.ts_mon_flush
    interface.state.flush_mode = flush_mode
    if flush_mode == 'auto':
        flusher = interface._FlushThread(args.ts_mon_flush_interval_secs)
        interface.state.flush_thread = flusher
        flusher.start()

    standard_metrics.init()