# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import codecs
import collections
try:
  import collections.abc as collections_abc  # Python 3.3+.
except ImportError:
  collections_abc = collections  # Python 2: the ABCs live in collections.
import gzip
import itertools
import json
import logging
import os
import sys
# Add tracing/ to the path.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import six
from tracing_build import html2trace, trace2html
try:
StringTypes = six.string_types # pylint: disable=invalid-name
except NameError:
StringTypes = str
GZIP_FILENAME_SUFFIX = '.gz'
HTML_FILENAME_SUFFIX = '.html'
# Relevant trace event phases. See
# https://code.google.com/p/chromium/codesearch#chromium/src/base/trace_event/common/trace_event_common.h.
METADATA_PHASE = 'M'
MEMORY_DUMP_PHASE = 'v'
BEGIN_PHASE = 'B'
END_PHASE = 'E'
CLOCK_SYNC_EVENT_PHASE = 'c'
# Minimum gap between two consecutive merged traces in microseconds.
MIN_TRACE_GAP_IN_US = 1000000
# Rule for matching IDs in an IdMap. For a given level, |match| should be a
# named tuple class whose fields name the keys of |entry._items| that are
# compared when matching pairs of IdMap entries; earlier fields take
# precedence (see ProcessIdMap below for a concrete definition).
IdMapLevel = collections.namedtuple('IdMapLevel', ['name', 'match'])
class IdMap(object):
"""Abstract class for merging and mapping IDs from multiple sources."""
# Sub-classes must provide a tuple of |IdMapLevel| objects.
LEVELS = NotImplemented
def __init__(self, depth=0):
assert 0 <= depth <= len(self.LEVELS)
self._depth = depth
if depth > 0:
# Non-root node.
self._canonical_id = None
self._items = collections.defaultdict(set)
self._sources = set()
if depth < len(self.LEVELS):
# Non-leaf node.
self._entry_map = {} # (Source, Entry ID) -> Entry.
@property
def max_mapped_id(self):
"""The maximum mapped ID of this map's entries."""
if not self._entry_map:
return 0
return max(e._canonical_id for e in self._entry_map.values())
def AddEntry(self, source, path, **items):
"""Add a source-specific entry path to the map.
Args:
source: The source of the entry (e.g. trace filename).
path: A path of source-specific entry IDs in the map (e.g. [PID, TID]).
**items: Dictionary of items (or sets of items) to be appended to the
target entry's |_items|.
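    Example:
      map.AddEntry(source='trace_A.json', path=[10, 20], name='CrRendererMain')
      creates (or reuses) the entry for process 10 and its sub-entry for
      thread 20, and adds 'CrRendererMain' to the thread entry's
      |_items['name']| set (the filename, IDs and name are illustrative).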
"""
if path:
return self._GetSubMapEntry(source, path[0]).AddEntry(source, path[1:],
**items)
assert 'id' not in items # ID is set according to the path.
for key, value in items.items():
value_set = self._items[key]
      if (isinstance(value, collections_abc.Iterable) and
          not isinstance(value, StringTypes)):
value_set.update(value)
else:
value_set.add(value)
def MapEntry(self, source, path):
"""Map an source-specific entry ID path to a canonical entry ID path.
Args:
source: The source of the entry (e.g. trace filename).
path: A path of source-specific entry IDs in the map (e.g. [PID, TID]).
Returns:
A path of canonical entry IDs in the map to which the provided path of
source-specific entry IDs is mapped.
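    Example:
      Continuing the MergeEntries example below: after merging,
      map.MapEntry(source='trace_B.json', path=[30]) returns a one-element
      path containing whichever of the IDs 10 and 30 was chosen as the
      canonical ID of the merged 'Browser' entry.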
"""
if not path:
return ()
entry = self._entry_map[(source, path[0])]
return (entry._canonical_id,) + entry.MapEntry(source, path[1:])
def MergeEntries(self):
"""Recursively merge the entries in this map.
Example: Suppose that the following entries were added to the map:
map.AddEntry(source='trace_A.json', path=[10], name='Browser')
map.AddEntry(source='trace_A.json', path=[20], name='Renderer')
map.AddEntry(source='trace_B.json', path=[30], name='Browser')
Before merging, |map._entry_map| will contain three separate items:
('trace_A.json', 10) -> IdMap(_items={id: {10}, name: {'Browser'}},
_sources={'trace_A.json'})
('trace_A.json', 20) -> IdMap(_items={id: {20}, name: {'Renderer'}},
_sources={'trace_A.json'})
('trace_B.json', 30) -> IdMap(_items={id: {30}, name: {'Browser'}},
_sources={'trace_B.json'})
Since the first two entries come from the same source, they cannot be
merged. On the other hand, the third entry could be merged with either of
the first two. Since it has a common name with the first entry, it will be
merged with it in this method:
('trace_A.json', 10) -> IdMap(_items={id: {10, 30}, name: {'Browser'}},
_sources={'trace_A.json', 'trace_B.json'})
      ('trace_A.json', 20) -> IdMap(_items={id: {20}, name: {'Renderer'}},
_sources={'trace_A.json'})
('trace_B.json', 30) -> <same IdMap as ('trace_A.json', 10)>
Pairs of entries will be merged in a descending order of sizes of
pair-wise intersections of |entry._items| until there are no two entries
such that (1) they have at least one value in |entry._items| in common and
(2) they are mergeable (i.e. have no common source). Afterwards, unique IDs
are assigned to the resulting "canonical" entries and their sub-entries are
merged recursively.
"""
# pylint: disable=unsubscriptable-object
if self._depth == len(self.LEVELS):
return
logging.debug('Merging %r entries in %s...', self.LEVELS[self._depth].name,
self)
canonical_entries = self._CanonicalizeEntries()
self._AssignIdsToCanonicalEntries(canonical_entries)
for entry in canonical_entries:
entry.MergeEntries()
def _GetSubMapEntry(self, source, entry_id):
full_id = (source, entry_id)
entry = self._entry_map.get(full_id)
if entry is None:
entry = type(self)(self._depth + 1)
entry._sources.add(source)
entry._items['id'].add(entry_id)
self._entry_map[full_id] = entry
return entry
def _CalculateUnmergeableMapFromEntrySources(self):
entry_ids_by_source = collections.defaultdict(set)
for entry_id, entry in self._entry_map.items():
for source in entry._sources:
entry_ids_by_source[source].add(entry_id)
unmergeable_map = collections.defaultdict(set)
for unmergeable_set in entry_ids_by_source.values():
for entry_id in unmergeable_set:
unmergeable_map[entry_id].update(unmergeable_set - {entry_id})
return unmergeable_map
def _IsMergeableWith(self, other):
return self._sources.isdisjoint(other._sources)
def _GetMatch(self, other):
# pylint: disable=unsubscriptable-object
match_cls = self.LEVELS[self._depth - 1].match
return match_cls(*(self._items[f] & other._items[f]
for f in match_cls._fields))
def _MergeFrom(self, source):
if self._depth > 0:
# This is NOT a ROOT node, so we need to merge fields and sources from
# the source node.
for key, values in source._items.items():
self._items[key].update(values)
self._sources.update(source._sources)
if self._depth < len(self.LEVELS):
# This is NOT a LEAF node, so we need to copy over entries from the
# source node's entry map.
assert not set(self._entry_map) & set(source._entry_map)
self._entry_map.update(source._entry_map)
def _CanonicalizeEntries(self):
canonical_entries = self._entry_map.copy()
# {ID1, ID2} -> Match between the two entries.
matches = {frozenset([full_id1, full_id2]): entry1._GetMatch(entry2)
for full_id1, entry1 in canonical_entries.items()
for full_id2, entry2 in canonical_entries.items()
if entry1._IsMergeableWith(entry2)}
while matches:
# Pop the maximum match from the dictionary.
max_match_set, max_match = max(
list(matches.items()), key=lambda pair: [len(v) for v in pair[1]])
del matches[max_match_set]
canonical_full_id, merged_full_id = max_match_set
# Skip pairs of entries that have nothing in common.
if not any(max_match):
continue
# Merge the entries and update the map to reflect this.
canonical_entry = canonical_entries[canonical_full_id]
merged_entry = canonical_entries.pop(merged_full_id)
logging.debug('Merging %s into %s [match=%s]...', merged_entry,
canonical_entry, max_match)
canonical_entry._MergeFrom(merged_entry)
del merged_entry
self._entry_map[merged_full_id] = canonical_entry
      # Iterate over a snapshot of the keys because |matches| is modified
      # inside the loop body.
      for match_set in list(matches.keys()):
if merged_full_id in match_set:
# Remove other matches with the merged entry.
del matches[match_set]
elif canonical_full_id in match_set:
[other_full_id] = match_set - {canonical_full_id}
other_entry = canonical_entries[other_full_id]
if canonical_entry._IsMergeableWith(other_entry):
# Update other matches with the canonical entry which are still
# mergeable.
matches[match_set] = canonical_entry._GetMatch(other_entry)
else:
# Remove other matches with the canonical entry which have become
# unmergeable.
del matches[match_set]
return list(canonical_entries.values())
def _AssignIdsToCanonicalEntries(self, canonical_entries):
assigned_ids = set()
canonical_entries_without_assigned_ids = set()
# Try to assign each canonical entry to one of the IDs from which it was
# merged.
for canonical_entry in canonical_entries:
candidate_ids = canonical_entry._items['id']
try:
assigned_id = next(candidate_id for candidate_id in candidate_ids
if candidate_id not in assigned_ids)
except StopIteration:
canonical_entries_without_assigned_ids.add(canonical_entry)
continue
assigned_ids.add(assigned_id)
canonical_entry._canonical_id = assigned_id
# For canonical entries where this cannot be done (highly unlikely), scan
# from the minimal merged ID upwards for the first unassigned ID.
for canonical_entry in canonical_entries_without_assigned_ids:
assigned_id = next(candidate_id for candidate_id in
itertools.count(min(canonical_entry._items['id']))
if candidate_id not in assigned_ids)
assigned_ids.add(assigned_id)
canonical_entry._canonical_id = assigned_id
def __repr__(self):
# pylint: disable=unsubscriptable-object
cls_name = type(self).__name__
if self._depth == 0:
return '%s root' % cls_name
else:
return '%s %s entry(%s)' % (cls_name, self.LEVELS[self._depth - 1].name,
self._items)
class ProcessIdMap(IdMap):
"""Class for merging and mapping PIDs and TIDs from multiple sources."""
LEVELS = (
IdMapLevel(name='process',
match=collections.namedtuple('ProcessMatch',
['name', 'id', 'label'])),
IdMapLevel(name='thread',
match=collections.namedtuple('ThreadMatch', ['name', 'id']))
)
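# Illustrative sketch of how a ProcessIdMap is driven (the real usage is in
# _CreateProcessMapFromTraceEvents below; filenames and PIDs are hypothetical):
#
#   process_map = ProcessIdMap()
#   process_map.AddEntry(source='trace_A.json', path=(10,), name='Browser')
#   process_map.AddEntry(source='trace_B.json', path=(30,), name='Browser')
#   process_map.MergeEntries()
#   # Both 'Browser' processes now share a single canonical PID, so:
#   assert (process_map.MapEntry(source='trace_A.json', path=(10,)) ==
#           process_map.MapEntry(source='trace_B.json', path=(30,)))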
def LoadTrace(filename):
"""Load a trace from a (possibly gzipped) file and return its parsed JSON."""
logging.info('Loading trace %r...', filename)
if filename.endswith(HTML_FILENAME_SUFFIX):
return LoadHTMLTrace(filename)
elif filename.endswith(GZIP_FILENAME_SUFFIX):
with gzip.open(filename, 'rb') as f:
return json.load(f)
else:
with open(filename, 'r') as f:
return json.load(f)
def LoadHTMLTrace(filename):
"""Load a trace from a vulcanized HTML trace file."""
trace_components = collections.defaultdict(list)
with open(filename) as file_handle:
for sub_trace in html2trace.ReadTracesFromHTMLFile(file_handle):
for name, component in TraceAsDict(sub_trace).items():
trace_components[name].append(component)
trace = {}
for name, components in trace_components.items():
if len(components) == 1:
trace[name] = components[0]
elif all(isinstance(component, list) for component in components):
trace[name] = [e for component in components for e in component]
else:
trace[name] = components[0]
logging.warning(
'Values of repeated trace component %r in HTML trace %r are not '
'lists. The first defined value of the component will be used.',
          name, filename)
return trace
def SaveTrace(trace, filename):
"""Save a JSON trace to a (possibly gzipped) file."""
if filename is None:
logging.info('Dumping trace to standard output...')
print(json.dumps(trace))
else:
logging.info('Saving trace %r...', filename)
if filename.endswith(HTML_FILENAME_SUFFIX):
with codecs.open(filename, mode='w', encoding='utf-8') as f:
trace2html.WriteHTMLForTraceDataToFile([trace], 'Merged trace', f)
elif filename.endswith(GZIP_FILENAME_SUFFIX):
      # Text mode so that json.dump (which writes str) also works on Python 3.
      with gzip.open(filename, 'wt') as f:
json.dump(trace, f)
else:
with open(filename, 'w') as f:
json.dump(trace, f)
def TraceAsDict(trace):
"""Ensure that a trace is a dictionary."""
if isinstance(trace, list):
return {'traceEvents': trace}
return trace
def MergeTraceFiles(input_trace_filenames, output_trace_filename):
"""Merge a collection of input trace files into an output trace file."""
logging.info('Loading %d input traces...', len(input_trace_filenames))
input_traces = collections.OrderedDict()
for input_trace_filename in input_trace_filenames:
input_traces[input_trace_filename] = LoadTrace(input_trace_filename)
logging.info('Merging traces...')
output_trace = MergeTraces(input_traces)
logging.info('Saving output trace...')
SaveTrace(output_trace, output_trace_filename)
logging.info('Finished.')
def MergeTraces(traces):
"""Merge a collection of JSON traces into a single JSON trace."""
trace_components = collections.defaultdict(collections.OrderedDict)
for filename, trace in traces.items():
for name, component in TraceAsDict(trace).items():
trace_components[name][filename] = component
merged_trace = {}
for component_name, components_by_filename in trace_components.items():
logging.info('Merging %d %r components...', len(components_by_filename),
component_name)
merged_trace[component_name] = MergeComponents(component_name,
components_by_filename)
return merged_trace
def MergeComponents(component_name, components_by_filename):
"""Merge a component of multiple JSON traces into a single component."""
if component_name == 'traceEvents':
return MergeTraceEvents(components_by_filename)
else:
return MergeGenericTraceComponents(component_name, components_by_filename)
def MergeTraceEvents(events_by_filename):
"""Merge trace events from multiple traces into a single list of events."""
# Remove strings from the list of trace events
# (https://github.com/catapult-project/catapult/issues/2497).
events_by_filename = collections.OrderedDict(
(filename, [e for e in events if not isinstance(e, StringTypes)])
for filename, events in events_by_filename.items())
timestamp_range_by_filename = _AdjustTimestampRanges(events_by_filename)
process_map = _CreateProcessMapFromTraceEvents(events_by_filename)
merged_events = _CombineTraceEvents(events_by_filename, process_map)
_RemoveSurplusClockSyncEvents(merged_events)
merged_events.extend(
_BuildInjectedTraceMarkerEvents(timestamp_range_by_filename, process_map))
return merged_events
def _RemoveSurplusClockSyncEvents(events):
"""Remove all clock sync events except for the first one."""
# TODO(petrcermak): Figure out how to handle merging multiple clock sync
# events.
clock_sync_event_indices = [i for i, e in enumerate(events)
if e['ph'] == CLOCK_SYNC_EVENT_PHASE]
# The indices need to be traversed from largest to smallest (hence the -1).
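  # For example, if clock sync events sit at indices [2, 5, 9], the reversed
  # slice below yields [9, 5], so those two events are deleted and the first
  # clock sync event (at index 2) is kept.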
for i in clock_sync_event_indices[:0:-1]:
del events[i]
def _AdjustTimestampRanges(events_by_filename):
logging.info('Adjusting timestamp ranges of traces...')
previous_trace_max_timestamp = 0
timestamp_range_by_filename = collections.OrderedDict()
for index, (filename,
events) in enumerate(list(events_by_filename.items()), 1):
# Skip metadata events, the timestamps of which are always zero.
non_metadata_events = [e for e in events if e['ph'] != METADATA_PHASE]
if not non_metadata_events:
logging.warning('Trace %r (%d/%d) only contains metadata events.',
filename, index, len(events_by_filename))
timestamp_range_by_filename[filename] = None
continue
min_timestamp = min(e['ts'] for e in non_metadata_events)
max_timestamp = max(e['ts'] for e in non_metadata_events)
# Determine by how much the timestamps should be shifted.
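    # For example (hypothetical values, all in microseconds): if the previous
    # trace ended at ts=8000 and this trace starts at ts=2000, its events are
    # shifted by 8000 + 1000000 - 2000 = 1006000 so that consecutive traces
    # are separated by at least MIN_TRACE_GAP_IN_US on the merged timeline.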
injected_timestamp_shift = max(
previous_trace_max_timestamp + MIN_TRACE_GAP_IN_US - min_timestamp, 0)
    logging.info('Injected timestamp shift in trace %r (%d/%d): %d us '
'[min=%d, max=%d, duration=%d].', filename, index,
len(events_by_filename), injected_timestamp_shift,
min_timestamp, max_timestamp, max_timestamp - min_timestamp)
if injected_timestamp_shift > 0:
# Shift the timestamps.
for event in non_metadata_events:
event['ts'] += injected_timestamp_shift
# Adjust the range.
min_timestamp += injected_timestamp_shift
max_timestamp += injected_timestamp_shift
previous_trace_max_timestamp = max_timestamp
timestamp_range_by_filename[filename] = min_timestamp, max_timestamp
return timestamp_range_by_filename
def _CreateProcessMapFromTraceEvents(events_by_filename):
logging.info('Creating process map from trace events...')
process_map = ProcessIdMap()
for filename, events in events_by_filename.items():
for event in events:
pid, tid = event['pid'], event['tid']
process_map.AddEntry(source=filename, path=(pid, tid))
if event['ph'] == METADATA_PHASE:
if event['name'] == 'process_name':
process_map.AddEntry(source=filename, path=(pid,),
name=event['args']['name'])
elif event['name'] == 'process_labels':
process_map.AddEntry(source=filename, path=(pid,),
label=event['args']['labels'].split(','))
elif event['name'] == 'thread_name':
process_map.AddEntry(source=filename, path=(pid, tid),
name=event['args']['name'])
process_map.MergeEntries()
return process_map
def _CombineTraceEvents(events_by_filename, process_map):
logging.info('Combining trace events from all traces...')
type_name_event_by_pid = {}
combined_events = []
for index, (filename,
events) in enumerate(list(events_by_filename.items()), 1):
for event in events:
if _UpdateTraceEventForMerge(event, process_map, filename, index,
type_name_event_by_pid):
combined_events.append(event)
return combined_events
def _UpdateTraceEventForMerge(event, process_map, filename, index,
type_name_event_by_pid):
pid, tid = process_map.MapEntry(source=filename,
path=(event['pid'], event['tid']))
event['pid'], event['tid'] = pid, tid
if event['ph'] == METADATA_PHASE:
# Update IDs in 'stackFrames' and 'typeNames' metadata events.
if event['name'] == 'stackFrames':
_UpdateDictIds(index, event['args'], 'stackFrames')
for frame in event['args']['stackFrames'].values():
_UpdateFieldId(index, frame, 'parent')
elif event['name'] == 'typeNames':
_UpdateDictIds(index, event['args'], 'typeNames')
existing_type_name_event = type_name_event_by_pid.get(pid)
if existing_type_name_event is None:
type_name_event_by_pid[pid] = event
else:
existing_type_name_event['args']['typeNames'].update(
event['args']['typeNames'])
# Don't add the event to the merged trace because it has been merged
# into an existing 'typeNames' metadata event for the given process.
return False
elif event['ph'] == MEMORY_DUMP_PHASE:
# Update stack frame and type name IDs in heap dump entries in process
# memory dumps.
for heap_dump in event['args']['dumps'].get('heaps', {}).values():
for heap_entry in heap_dump['entries']:
_UpdateFieldId(index, heap_entry, 'bt', ignored_values=[''])
_UpdateFieldId(index, heap_entry, 'type')
return True # Events should be added to the merged trace by default.
def _ConvertId(index, original_id):
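  # E.g. _ConvertId(2, '0x7f') returns '2#0x7f': IDs are namespaced by the
  # 1-based index of the input trace so that IDs from different traces
  # cannot collide in the merged trace.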
return '%d#%s' % (index, original_id)
def _UpdateDictIds(index, parent_dict, key):
parent_dict[key] = {
_ConvertId(index, original_id): value
for original_id, value in parent_dict[key].items()}
def _UpdateFieldId(index, parent_dict, key, ignored_values=()):
original_value = parent_dict.get(key)
if original_value is not None and original_value not in ignored_values:
parent_dict[key] = _ConvertId(index, original_value)
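# Illustrative example of the two helpers above (hypothetical IDs): with
# index=2 and args == {'typeNames': {'7': 'WTF::StringImpl'}},
# _UpdateDictIds(2, args, 'typeNames') rewrites args['typeNames'] to
# {'2#7': 'WTF::StringImpl'}, and _UpdateFieldId(2, {'type': '7'}, 'type')
# turns the dictionary into {'type': '2#7'}.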
def _BuildInjectedTraceMarkerEvents(timestamp_range_by_filename, process_map):
logging.info('Building injected trace marker events...')
injected_pid = process_map.max_mapped_id + 1
# Inject a mock process with a thread.
injected_events = [
{
'pid': injected_pid,
'tid': 0,
'ph': METADATA_PHASE,
'ts': 0,
'name': 'process_sort_index',
'args': {'sort_index': -1000} # Show the process at the top.
},
{
'pid': injected_pid,
'tid': 0,
'ph': METADATA_PHASE,
'ts': 0,
'name': 'process_name',
'args': {'name': 'Merged traces'}
},
{
'pid': injected_pid,
'tid': 0,
'ph': METADATA_PHASE,
'ts': 0,
'name': 'thread_name',
'args': {'name': 'Trace'}
}
]
# Inject slices for each sub-trace denoting its beginning and end.
for index, (filename, timestamp_range) in enumerate(
list(timestamp_range_by_filename.items()), 1):
if timestamp_range is None:
continue
min_timestamp, max_timestamp = timestamp_range
name = 'Trace %r (%d/%d)' % (filename, index,
len(timestamp_range_by_filename))
slice_id = 'INJECTED_TRACE_MARKER_%d' % index
injected_events.extend([
{
'pid': injected_pid,
'tid': 0,
'ph': BEGIN_PHASE,
'cat': 'injected',
'name': name,
'id': slice_id,
'ts': min_timestamp
},
{
'pid': injected_pid,
'tid': 0,
'ph': END_PHASE,
'cat': 'injected',
'name': name,
'id': slice_id,
'ts': max_timestamp
}
])
return injected_events
def MergeGenericTraceComponents(component_name, components_by_filename):
"""Merge a generic component of multiple JSON traces into a single component.
  This function is only used for components that don't have a
  component-specific merging function (see MergeTraceEvents). It simply
  returns the first provided value of the component and logs a warning if
  the values differ across the input traces.
"""
  components = list(components_by_filename.values())
  first_component = components[0]
if not all(c == first_component for c in components):
logging.warning(
'Values of trace component %r differ across the provided traces. '
'The first defined value of the component will be used.',
component_name)
return first_component
def Main(argv):
parser = argparse.ArgumentParser(description='Merge multiple traces.',
add_help=False)
parser.add_argument('input_traces', metavar='INPUT_TRACE', nargs='+',
help='Input trace filename.')
parser.add_argument('-h', '--help', action='help',
help='Show this help message and exit.')
parser.add_argument('-o', '--output_trace', help='Output trace filename. If '
'not provided, the merged trace will be written to '
'the standard output.')
  parser.add_argument('-v', '--verbose', action='count', dest='verbosity',
                      default=0, help='Increase verbosity level.')
args = parser.parse_args(argv[1:])
# Set verbosity level.
if args.verbosity >= 2:
logging_level = logging.DEBUG
elif args.verbosity == 1:
logging_level = logging.INFO
else:
logging_level = logging.WARNING
logging.getLogger().setLevel(logging_level)
try:
MergeTraceFiles(args.input_traces, args.output_trace)
return 0
except Exception: # pylint: disable=broad-except
logging.exception('Something went wrong:')
return 1
finally:
logging.warning('This is an EXPERIMENTAL TOOL! If you encounter any '
'issues, please file a Catapult bug '
'(https://github.com/catapult-project/catapult/issues/new) '
'with your current Catapult commit hash, a description of '
'the problem and any error messages, attach the input '
'traces and notify petrcermak@chromium.org. Thank you!')
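# Illustrative invocation (assuming this module is run as a script via a
# standard __main__ guard that calls Main(sys.argv); filenames are
# hypothetical):
#   python merge_traces.py trace_1.json.gz trace_2.html -o merged.html -vv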