| # Copyright 2014 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Log status events to a file on disk or event collection endpoint.""" |
| |
| |
| import json |
| import logging |
| import logging.handlers |
| import os |
| import re |
| import time |
| |
| from logging.handlers import TimedRotatingFileHandler |
| |
| import buildbot.status.results |
| |
| from buildbot.status.base import StatusReceiverMultiService |
| from twisted.python import log as twisted_log |
| |
| from common import chromium_utils |
| |
| from infra_libs import ts_mon |
| |
| step_field_spec = [ |
| ts_mon.StringField('builder'), |
| ts_mon.StringField('master'), |
| ts_mon.StringField('project_id'), |
| ts_mon.StringField('result'), |
| ts_mon.StringField('slave'), |
| ts_mon.StringField('step_name'), |
| ts_mon.StringField('subproject_tag'), |
| ] |
| |
| step_durations = ts_mon.CumulativeDistributionMetric( |
| 'buildbot/master/builders/steps/durations', |
| 'Time (in seconds) from step start to step end', |
| step_field_spec, |
| units=ts_mon.MetricsDataUnits.SECONDS, |
| # Use fixed-width bucketer up to 2.7 hours with 10-second precision. |
| bucketer=ts_mon.FixedWidthBucketer(10, 1000)) |
| |
| step_counts = ts_mon.CounterMetric( |
| 'buildbot/master/builders/steps/count', |
| 'Count of step results, per builder and step', |
| step_field_spec) |
| |
| field_spec = [ |
| ts_mon.StringField('builder'), |
| ts_mon.StringField('master'), |
| ts_mon.StringField('project_id'), |
| ts_mon.StringField('result'), |
| ts_mon.StringField('slave'), |
| ts_mon.StringField('subproject_id'), |
| ] |
| |
| result_count = ts_mon.CounterMetric('buildbot/master/builders/results/count', |
| 'Number of items consumed from ts_mon.log by mastermon', |
| field_spec) |
| # A custom bucketer with 12% resolution in the range of 1..10**5, |
| # better suited for build cycle times. |
| bucketer = ts_mon.GeometricBucketer( |
| growth_factor=10**0.05, num_finite_buckets=100) |
| cycle_times = ts_mon.CumulativeDistributionMetric( |
| 'buildbot/master/builders/builds/durations', |
| 'Durations (in seconds) that slaves spent actively doing work towards ' |
| 'builds for each builder', |
| field_spec, |
| bucketer=bucketer) |
| pending_times = ts_mon.CumulativeDistributionMetric( |
| 'buildbot/master/builders/builds/pending_durations', |
| 'Durations (in seconds) that the master spent waiting for slaves to become ' |
| 'available for each builder', |
| field_spec, |
| bucketer=bucketer) |
| total_times = ts_mon.CumulativeDistributionMetric( |
| 'buildbot/master/builders/builds/total_durations', |
| 'Total duration (in seconds) that builds took to complete for each builder', |
| field_spec, |
| bucketer=bucketer) |
| |
| pre_test_times = ts_mon.CumulativeDistributionMetric( |
| 'buildbot/master/builders/builds/pre_test_durations', |
| 'Durations (in seconds) that builds spent before their "before_tests" step', |
| field_spec, |
| bucketer=bucketer) |
| |
| |
| class StatusEventLogger(StatusReceiverMultiService): |
| """Log status events to a file on disk or event collection endpoint. |
| |
| Files on disk are rotated, while the event collection endpoint is contacted |
| through a script in the infra/infra repository (separate checkout). |
| |
| A file, .logstatus, is used to configure the logger. If it exists then |
| file logging is enabled. If it parses as json, the keys event_logging, |
| file_logging, logging_ignore_basedir, logfile, can be used to configure the |
| logger at runtime. |
| """ |
| |
| DEFAULT_LOGGING_IGNORE_BASEDIR = False |
| |
| def __init__(self, basedir=None, event_logging_dir=None, |
| event_logfile=None, ts_mon_logfile=None, |
| logging_ignore_basedir=None, event_logging=None): |
| """Create a StatusEventLogger. |
| |
| Args: |
| basedir: the basedir of the configuration and log files. Set to the |
| service's parent directory by default, mainly overridden for |
| testing. |
| event_logging_dir: directory where to write events. This object adds the |
| master name to the path. Mainly overridden for testing. |
| event_logfile: file name for writing events. |
| ts_mon_logfile: file name for writing ts_mon metrics. |
| logging_ignore_basedir (bool): when True, do not prepend |
| event_logging_dir to event_logfile and ts_mon_logfile. |
| event_logging (bool or None): enables logging events to event pipeline. |
| Default: enabled. |
| """ |
| self._basedir = basedir |
| self.master_dir = os.path.basename(os.path.abspath(os.curdir)) |
| |
| self.logging_ignore_basedir = ( |
| logging_ignore_basedir or self.DEFAULT_LOGGING_IGNORE_BASEDIR) |
| |
| self._event_logging_dir = os.path.join( |
| event_logging_dir or '/var/log/chrome-infra', |
| 'status_logger-' + self.master_dir) |
| |
| event_logfile = event_logfile or 'events.log' |
| ts_mon_logfile = ts_mon_logfile or 'ts_mon.log' |
| |
| if not self.logging_ignore_basedir: |
| event_logfile = os.path.join(self._event_logging_dir, event_logfile) |
| ts_mon_logfile = os.path.join(self._event_logging_dir, ts_mon_logfile) |
| |
| self._event_logfile = event_logfile |
| self._ts_mon_logfile = ts_mon_logfile |
| |
| # Will be initialized in startService. |
| self.event_logger = None |
| self.ts_mon_logger = None |
| self.status = None |
| self._last_checked_active = 0 |
| self._event_logging = event_logging if event_logging is not None else True |
| # Can't use super because StatusReceiverMultiService is an old-style class. |
| StatusReceiverMultiService.__init__(self) |
| |
| def as_dict(self): |
| return { |
| 'basedir': self.basedir, |
| 'event_logging': self._event_logging, |
| 'logging_ignore_basedir': self.logging_ignore_basedir, |
| } |
| |
| @staticmethod |
| def _get_requested_at_millis(build): |
| return int(build.getProperty('requestedAt') * 1000) |
| |
| @property |
| def basedir(self): |
| """Returns dynamic or preset basedir. |
| |
| self.parent doesn't exist until the service is running, so this has to be |
| here instead of precomputing the logfile in __init__. |
| """ |
| return self._basedir or self.parent.basedir |
| |
| def _canonical_file(self, filename, ignore_basedir=False): |
| """Returns an absolute path for a config or log file.""" |
| if ignore_basedir: |
| full_filename = filename |
| else: |
| full_filename = os.path.join(self.basedir, filename) |
| return chromium_utils.AbsoluteCanonicalPath(full_filename) |
| |
| def send_build_result( |
| self, scheduled, started, finished, builder_name, bot_name, result, |
| project_id=None, subproject_tag=None, steps=None, pre_test_time_s=None): |
| """Log a build result for ts_mon. |
| |
| This allows computing metrics for builds in mastermon. |
| """ |
| d = { |
| 'timestamp_ms': finished * 1000, |
| 'builder': builder_name, |
| 'slave': bot_name, |
| 'result': result.lower(), |
| 'duration_s': finished - started, |
| 'pending_s': started - scheduled, |
| 'total_s': finished - scheduled, |
| } |
| if project_id: |
| d['project_id'] = project_id |
| if subproject_tag: |
| d['subproject_tag'] = subproject_tag |
| if steps: |
| d['steps'] = steps |
| if pre_test_time_s is not None: |
| d['pre_test_time_s'] = pre_test_time_s |
| self.ts_mon_logger.info(json.dumps(d)) |
| |
| def send_step_result( |
| self, timestamp, builder_name, bot_name, |
| step_result, project_id, subproject_tag): |
| """Log step results for ts_mon |
| |
| Args: |
| timestamp(int): when the event was generated (end of a step). Seconds |
| since the Unix epoch. |
| builder_name(str): name of the builder the steps are part of. |
| bot_name(str): name of the machine running the build. |
| step_result (str): result for this step. |
| project_id(str): 'project' as shown on the codereview. |
| subproject_tag(str): a mention of a subproject. Mostly used to distinguish |
| between chromium and blink CLs in the chromium project. |
| """ |
| # The presence of the field 'step_result' is how the collecting daemon |
| # tells the difference between this case and the one from send_build_result. |
| d = { |
| 'timestamp': timestamp, |
| 'builder': builder_name, |
| 'step_result': step_result, |
| 'slave': bot_name, |
| } |
| |
| if project_id: |
| d['project_id'] = project_id |
| if subproject_tag: |
| d['subproject_tag'] = subproject_tag |
| |
| self.ts_mon_logger.info(json.dumps(d)) |
| |
| |
| def send_build_event(self, timestamp_kind, timestamp, build_event_type, |
| bot_name, builder_name, build_number, build_scheduled_ts, |
| step_name=None, step_text=None, step_number=None, |
| result=None, extra_result_code=None, patch_url=None, |
| bbucket_id=None, category=None, fail_type=None, |
| head_revision_git_hash=None): |
| """Log a build/step event for event_mon.""" |
| |
| if self._event_logging: |
| # List options to pass to send_monitoring_event, without the --, to save |
| # a bit of space. |
| d = {'event-mon-timestamp-kind': timestamp_kind, |
| 'event-mon-event-timestamp': timestamp, |
| 'event-mon-service-name': 'buildbot/master/%s' % self.master_dir, |
| 'build-event-type': build_event_type, |
| 'build-event-hostname': bot_name, |
| 'build-event-build-name': builder_name, |
| 'build-event-build-number': build_number, |
| 'build-event-build-scheduling-time': build_scheduled_ts, |
| } |
| if step_name: |
| d['build-event-step-name'] = step_name |
| d['build-event-step-text'] = step_text |
| d['build-event-step-number'] = step_number |
| if result: |
| d['build-event-result'] = result.upper() |
| if extra_result_code: |
| d['build-event-extra-result-code'] = extra_result_code |
| if patch_url: |
| d['build-event-patch-url'] = patch_url |
| if bbucket_id: |
| d['build-event-bbucket-id'] = bbucket_id |
| if category: |
| d['build-event-category'] = category |
| if fail_type: |
| d['build-event-fail-type'] = fail_type |
| if head_revision_git_hash: |
| d['build-event-head-revision-git-hash'] = head_revision_git_hash |
| |
| self.event_logger.info(json.dumps(d)) |
| |
| def _create_logging_dir(self): |
| """Make sure the logging directory exists. |
| |
| Try to create the directory if it doesn't exist, returns False if it |
| fails. |
| |
| Returns: |
| logs_dir_exists(bool): True is the directory is available |
| |
| """ |
| event_logging_dir_exists = os.path.isdir(self._event_logging_dir) |
| if not event_logging_dir_exists: |
| try: |
| os.mkdir(self._event_logging_dir) |
| except OSError: |
| twisted_log.msg('Logging directory cannot be created, no events will ' |
| 'be written:', self._event_logging_dir) |
| else: |
| event_logging_dir_exists = True |
| |
| return event_logging_dir_exists |
| |
| def _create_ts_mon_logger(self): |
| """Set up a logger for ts_mon events. |
| |
| If the destination directory does not exist, ignore data sent to |
| ts_mon_logger. |
| """ |
| |
| event_logging_dir_exists = self._create_logging_dir() |
| logger = logging.getLogger(__name__ + '_ts_mon') |
| # Remove handlers that may already exist. This is useful when changing the |
| # log file name. |
| for handler in logger.handlers: |
| handler.flush() |
| logger.handlers = [] |
| |
| logger.propagate = False |
| logger.setLevel(logging.INFO) |
| formatter = logging.Formatter('%(message)s') |
| |
| if event_logging_dir_exists: |
| # Use delay=True so we don't unnecessary open a new file. |
| # Also use WatchedFileHandler because it'll be rotated by an external |
| # process. |
| handler = logging.handlers.WatchedFileHandler(self._ts_mon_logfile, |
| encoding='utf-8', |
| delay=True) |
| else: |
| handler = logging.NullHandler() |
| handler.setFormatter(formatter) |
| logger.addHandler(handler) |
| self.ts_mon_logger = logger |
| |
| def _create_event_logger(self): |
| """Set up a logger for monitoring events. |
| |
| If the destination directory does not exist, ignore data sent to |
| event_logger. |
| """ |
| event_logging_dir_exists = self._create_logging_dir() |
| |
| logger = logging.getLogger(__name__ + '_event') |
| # Remove handlers that may already exist. This is useful when changing the |
| # log file name. |
| for handler in logger.handlers: |
| handler.flush() |
| logger.handlers = [] |
| |
| logger.propagate = False |
| logger.setLevel(logging.INFO) |
| formatter = logging.Formatter('%(message)s') |
| |
| if event_logging_dir_exists: |
| # Use delay=True so we don't unnecessarily open a new file. |
| handler = TimedRotatingFileHandler(self._event_logfile, backupCount=1440, |
| when='M', interval=1, delay=True) |
| else: |
| handler = logging.NullHandler() |
| handler.setFormatter(formatter) |
| logger.addHandler(handler) |
| self.event_logger = logger |
| |
| def _get_patch_url(self, build_properties): |
| patch_url = None |
| if build_properties.getProperty('patch_storage') == 'gerrit': |
| patch_url = '%s/c/%s/+/%s/%s' % ( |
| build_properties.getProperty('patch_gerrit_url'), |
| build_properties.getProperty('patch_project'), |
| build_properties.getProperty('patch_issue'), |
| build_properties.getProperty('patch_set')) |
| return patch_url |
| |
| def _get_bbucket_id(self, build_properties): |
| """Retrieves buildbucket id from build properties. |
| |
| Returns None when buildbucket properties are missing or malformed. |
| """ |
| if 'buildbucket' not in build_properties: |
| return |
| |
| try: |
| bbucket_props = json.loads( |
| build_properties.getProperty('buildbucket')) |
| except (ValueError, TypeError) as e: |
| twisted_log.msg('Failed to parse buildbucket property as JSON: %s' % e) |
| return |
| |
| if isinstance(bbucket_props, dict): |
| build = bbucket_props.get('build', {}) |
| if isinstance(build, dict): |
| return build.get('id') |
| else: |
| twisted_log.msg('Build inside buildbucket property is not a dict') |
| else: |
| twisted_log.msg('Buildbucket property is not a dict') |
| |
| def _get_head_revision_git_hash(self, build_properties): |
| if 'got_revision' in build_properties: |
| revision = build_properties.getProperty('got_revision').lower() |
| if re.match('^[0-9a-f]{40}$', revision): |
| return revision |
| |
| |
| def startService(self): |
| """Start the service and subscribe for updates.""" |
| self._create_event_logger() |
| self._create_ts_mon_logger() |
| |
| StatusReceiverMultiService.startService(self) |
| self.status = self.parent.getStatus() |
| self.status.subscribe(self) |
| |
| # Unused, but is required by the API to enable other events. |
| def builderAdded(self, builderName, builder): |
| # Must return self in order to subscribe to builderChangedState and |
| # buildStarted/Finished events. |
| return self |
| |
| def buildStarted(self, builderName, build): |
| build_number = build.getNumber() |
| bot = build.getSlavename() |
| started, _ = build.getTimes() |
| properties = build.getProperties() |
| self.send_build_event( |
| 'BEGIN', started * 1000, 'BUILD', bot, builderName, build_number, |
| self._get_requested_at_millis(build), |
| patch_url=self._get_patch_url(properties), |
| bbucket_id=self._get_bbucket_id(properties), |
| category=properties.getProperty('category')) |
| # Must return self in order to subscribe to stepStarted/Finished events. |
| return self |
| |
| def stepStarted(self, build, step): |
| bot = build.getSlavename() |
| builder_name = build.getBuilder().name |
| build_number = build.getNumber() |
| step_name = step.getName() |
| step_text = ' '.join(step.getText()) |
| started, _ = step.getTimes() |
| properties = build.getProperties() |
| self.send_build_event( |
| 'BEGIN', started * 1000, 'STEP', bot, builder_name, build_number, |
| self._get_requested_at_millis(build), |
| step_name=step_name, step_text=step_text, step_number=step.step_number, |
| patch_url=self._get_patch_url(properties), |
| bbucket_id=self._get_bbucket_id(properties), |
| category=properties.getProperty('category')) |
| # Must return self in order to subscribe to logStarted/Finished events. |
| return self |
| |
| def stepFinished(self, build, step, results): |
| builder_name = build.getBuilder().name |
| build_number = build.getNumber() |
| bot = build.getSlavename() |
| step_name = step.getName() |
| step_text = ' '.join(step.getText()) |
| started, finished = step.getTimes() |
| properties = build.getProperties() |
| self.send_build_event( |
| 'END', finished * 1000, 'STEP', bot, builder_name, build_number, |
| self._get_requested_at_millis(build), |
| step_name=step_name, step_text=step_text, step_number=step.step_number, |
| result=buildbot.status.results.Results[results[0]], |
| patch_url=self._get_patch_url(properties), |
| bbucket_id=self._get_bbucket_id(properties), |
| category=properties.getProperty('category')) |
| |
| # Send step result to ts-mon |
| properties = build.getProperties() |
| project_id = properties.getProperty('patch_project') |
| subproject_tag = properties.getProperty('subproject_tag') |
| step_result = buildbot.status.results.Results[step.getResults()[0]] |
| |
| self.send_step_result( |
| finished, |
| builder_name, |
| bot, |
| step_result, |
| project_id, |
| subproject_tag) |
| |
| if re.match('bot_update|update_scripts', step_name): |
| values = { |
| 'slave': bot, |
| 'project_id': project_id, |
| 'builder': builder_name, |
| 'result': step_result, |
| 'subproject_tag': subproject_tag, |
| 'step_name': step_name.split()[0], |
| 'master': self.master_dir |
| } |
| fields = { key: value if value is not None else '' |
| for key, value in values.iteritems() } |
| step_durations.add(finished - started, fields=fields) |
| step_counts.increment(fields=fields) |
| |
| def buildFinished(self, builderName, build, results): |
| build_number = build.getNumber() |
| bot = build.getSlavename() |
| started, finished = build.getTimes() |
| |
| # Calculate when build was scheduled if possible. Use build started |
| # timestamp as initial approximation. |
| scheduled = started |
| source_stamp = build.getSourceStamp() |
| if source_stamp and source_stamp.changes: |
| scheduled = source_stamp.changes[0].when |
| |
| properties = build.getProperties() |
| extra_result_code = properties.getProperty('extra_result_code') |
| |
| self.send_build_event( |
| 'END', finished * 1000, 'BUILD', bot, builderName, build_number, |
| self._get_requested_at_millis(build), |
| result=buildbot.status.results.Results[results], |
| extra_result_code=extra_result_code, |
| patch_url=self._get_patch_url(properties), |
| bbucket_id=self._get_bbucket_id(properties), |
| category=properties.getProperty('category'), |
| fail_type=properties.getProperty('failure_type'), |
| head_revision_git_hash=self._get_head_revision_git_hash(properties)) |
| |
| pre_test_time_s = None |
| for step in build.getSteps(): |
| if step.getName() == 'mark: before_tests': |
| step_started, _ = step.getTimes() |
| pre_test_time_s = step_started - started |
| |
| # It's important that the recipe does not generate unbounded number |
| # of step names (e.g. one for each git revision), to avoid stream |
| # explosion in the monitoring system. Another alternative is for the recipe |
| # to clearly mark such dynamic steps - e.g. add "(dynamic)" to the name, |
| # and exclude such steps here. |
| WHITELISTED_RECIPES = [ |
| 'chromium_trybot', |
| ] |
| steps_to_send = [] |
| if properties.getProperty('recipe') in WHITELISTED_RECIPES: |
| for step in build.getSteps(): |
| step_started, step_finished = step.getTimes() |
| steps_to_send.append({ |
| 'step_name': step.getName(), |
| 'duration_s': step_finished - step_started, |
| 'result': buildbot.status.results.Results[step.getResults()[0]], |
| }) |
| |
| # If property doesn't exist, this function returns None. |
| # Note: this is not true for build.getProperty(), it raises KeyError. |
| project_id = properties.getProperty('patch_project') |
| subproject_tag = properties.getProperty('subproject_tag') |
| |
| fields = { |
| 'master': 'deprecated', |
| 'builder': builderName, |
| 'slave': bot, |
| 'result': buildbot.status.results.Results[results].lower(), |
| 'project_id': project_id if project_id else 'unknown', |
| 'subproject_id': subproject_tag if subproject_tag else 'unknown', |
| } |
| result_count.increment(fields) |
| cycle_times.add(finished - started, fields=fields) |
| pending_times.add(started - scheduled, fields=fields) |
| total_times.add(finished - scheduled, fields=fields) |
| if pre_test_time_s: |
| pre_test_times.add(pre_test_time_s, fields=fields) |
| |
| # TODO(sergeyberezin): remove this when all masters are restarted, |
| # and all graphs and alerts are migrated to the new metrics. Do |
| # this before turning down mastermon - to avoid accumulating logs. |
| self.send_build_result( |
| scheduled, started, finished, builderName, bot, |
| buildbot.status.results.Results[results], |
| project_id, subproject_tag, steps=steps_to_send, |
| pre_test_time_s=pre_test_time_s) |