| # Copyright 2013 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| """Extract histogram names from the description XML file. |
| |
| For more information on the format of the XML file, which is self-documenting, |
| see histograms.xml; however, here is a simple example to get you started. The |
| XML below will generate the following five histograms: |
| |
| HistogramTime |
| HistogramEnum |
| HistogramEnum_Chrome |
| HistogramEnum_IE |
| HistogramEnum_Firefox |
| |
| <histogram-configuration> |
| |
| <histograms> |
| |
| <histogram name="HistogramTime" units="milliseconds"> |
| <owner>person@chromium.org</owner> |
| <owner>some-team@chromium.org</owner> |
| <summary>A brief description.</summary> |
| <details>This is a more thorough description of this histogram.</details> |
| </histogram> |
| |
| <histogram name="HistogramEnum" enum="MyEnumType"> |
| <owner>person@chromium.org</owner> |
| <summary>This histogram sports an enum value type.</summary> |
| </histogram> |
| |
| </histograms> |
| |
| <enums> |
| |
| <enum name="MyEnumType"> |
| <summary>This is an example enum type, where the values mean little.</summary> |
| <int value="1" label="FIRST_VALUE">This is the first value.</int> |
| <int value="2" label="SECOND_VALUE">This is the second value.</int> |
| </enum> |
| |
| </enums> |
| |
| <histogram_suffixes_list> |
| |
| <histogram_suffixes name="BrowserType" separator="_"> |
| <suffix name="Chrome"/> |
| <suffix name="IE"/> |
| <suffix name="Firefox"/> |
| <affected-histogram name="HistogramEnum"/> |
| </histogram_suffixes> |
| |
| </histogram_suffixes_list> |
| |
| </histogram-configuration> |
| """ |
| |
import bisect
import copy
import datetime
import HTMLParser
import logging
import re
import xml.dom.minidom
| |
# Loose syntactic check for an owner e-mail address: one or more word
# characters or any of "-+%." on each side of a single "@".
BASIC_EMAIL_REGEXP = r'^[\w\-\+\%\.]+\@[\w\-\+\%\.]+$'

# Text that counts as "has an owner" when found inside an <owner> tag, even
# though it is not an e-mail address.
OWNER_PLACEHOLDER = (
    "Please list the metric's owners. Add more owner tags as needed.")

# How many times a histogram_suffixes node may be re-queued while one of its
# affected histograms has not been generated yet.
MAX_HISTOGRAM_SUFFIX_DEPENDENCY_DEPTH = 5

# Obsolete reason given to base histograms that do not carry their own.
DEFAULT_BASE_HISTOGRAM_OBSOLETE_REASON = (
    'Base histogram. Use suffixes of this histogram instead.')

# Accepted forms of the expires_after attribute: a strptime date pattern and a
# milestone such as "M75" ('M' followed by two or three digits).
EXPIRY_DATE_PATTERN = '%Y-%m-%d'
EXPIRY_MILESTONE_RE = re.compile(r'M[0-9]{2,3}\Z')
| |
| |
class Error(Exception):
  """Raised when the histogram description XML cannot be processed."""
| |
| |
def _GetTextFromChildNodes(node):
  """Builds one text string out of the children of |node|.

  Comment children are skipped. Every other child is serialized with toxml(),
  cut into paragraphs at blank lines, and each non-empty paragraph is passed
  through NormalizeString() so that runs of white space collapse to a single
  space. Paragraph breaks are kept so long descriptions stay readable on
  dashboards.

  For example, given
    <tag>
      Some
      words.

      <!--Child comment node.-->

      Words.
    </tag>
  the result is 'Some words.\n\nWords.'.

  Args:
    node: The DOM Element whose children's text is extracted.

  Returns:
    The concatenated text with leading and trailing white space stripped.
  """
  separator = '\n\n'
  comment_type = xml.dom.minidom.Node.COMMENT_NODE
  pieces = []

  for child in node.childNodes:
    if child.nodeType == comment_type:
      continue
    serialized = child.toxml()
    if not serialized:
      continue

    # Empty chunks are dropped before normalization, but a chunk made only of
    # white space normalizes to '' and is kept, which is what preserves the
    # paragraph break between two text nodes separated by a comment.
    paragraphs = []
    for chunk in serialized.split(separator):
      if chunk:
        paragraphs.append(NormalizeString(chunk))
    pieces.append(separator.join(paragraphs))

  return ''.join(pieces).strip()
| |
| |
def NormalizeString(text):
  r"""Replaces all white space sequences with a single space.

  Also, unescapes any HTML escaped characters, e.g. &quot; or &gt;.

  Args:
    text: The string to normalize, e.g. '\n\n a \n b&gt;c '.

  Returns:
    The normalized string, e.g. 'a b>c'.
  """
  line = ' '.join(text.split())

  # HTMLParser.HTMLParser.unescape() only exists on Python 2 (its Python 3
  # counterpart was removed in 3.9). Prefer html.unescape() when available and
  # fall back to the Python 2 parser otherwise.
  try:
    from html import unescape
  except ImportError:
    from HTMLParser import HTMLParser as html_parser_class
    unescape = html_parser_class().unescape
  return unescape(line)
| |
| |
def _NormalizeAllAttributeValues(node):
  """Applies NormalizeString() to every attribute value in a DOM subtree.

  The tree is modified in place, depth first.

  Args:
    node: The minidom node at the root of the subtree to normalize.

  Returns:
    The same |node| object that was passed in.
  """
  is_element = node.nodeType == xml.dom.minidom.Node.ELEMENT_NODE
  if is_element:
    attribute_map = node.attributes
    for attribute_name in attribute_map.keys():
      attribute = attribute_map[attribute_name]
      attribute.value = NormalizeString(attribute.value)

  for child in node.childNodes:
    _NormalizeAllAttributeValues(child)
  return node
| |
| |
def _ExpandHistogramNameWithSuffixes(suffix_name, histogram_name,
                                     histogram_suffixes_node):
  """Creates a new histogram name based on a histogram suffix.

  The histogram_suffixes node may carry two optional attributes:
    separator: The string placed between the histogram name and the suffix.
        Defaults to '_'.
    ordering: Either 'suffix' (the default) or 'prefix', optionally followed
        by ',N' where N is the number of dot-separated leading sections of
        the histogram name that stay in front of the inserted prefix (default
        1).

  Args:
    suffix_name: The suffix string to apply to the histogram name. May be empty.
    histogram_name: The name of the histogram. May be of the form
      Group.BaseName or BaseName.
    histogram_suffixes_node: The histogram_suffixes XML node.

  Returns:
    A string with the expanded histogram name. When |suffix_name| is empty the
    histogram name is returned unchanged.

  Raises:
    Error: if the expansion can't be done: the ordering is neither 'prefix'
      nor 'suffix', the placement is not an integer, or a prefix expansion is
      requested for a name with too few dot-separated sections.
  """
  if histogram_suffixes_node.hasAttribute('separator'):
    separator = histogram_suffixes_node.getAttribute('separator')
  else:
    separator = '_'

  if histogram_suffixes_node.hasAttribute('ordering'):
    ordering_spec = histogram_suffixes_node.getAttribute('ordering')
  else:
    ordering_spec = 'suffix'
  parts = ordering_spec.split(',')
  ordering = parts[0]
  placement = 1
  if len(parts) > 1:
    try:
      placement = int(parts[1])
    except ValueError:
      # Report a malformed placement the same way as every other expansion
      # failure; callers only handle Error.
      logging.error('ordering placement needs to be an integer, value is %s',
                    parts[1])
      raise Error()
  if ordering not in ('prefix', 'suffix'):
    logging.error('ordering needs to be prefix or suffix, value is %s',
                  ordering)
    raise Error()

  if not suffix_name:
    return histogram_name

  if ordering == 'suffix':
    return histogram_name + separator + suffix_name

  # For prefixes, the suffix_name is inserted between the "cluster" and the
  # "remainder", e.g. Foo.BarHist expanded with gamma becomes Foo.gamma_BarHist.
  sections = histogram_name.split('.')
  if len(sections) <= placement:
    logging.error(
        'Prefix histogram_suffixes expansions require histogram names which '
        'include a dot separator. Histogram name is %s, histogram_suffixes is '
        '%s, and placement is %d', histogram_name,
        histogram_suffixes_node.getAttribute('name'), placement)
    raise Error()

  cluster = '.'.join(sections[0:placement]) + '.'
  remainder = '.'.join(sections[placement:])
  return cluster + suffix_name + separator + remainder
| |
| |
def ExtractEnumsFromXmlTree(tree):
  """Extracts all <enum> nodes in the tree into a dictionary.

  Args:
    tree: A DOM node; every <enum> element beneath it is processed.

  Returns:
    A tuple (enums, have_errors). enums maps each enum name to a dictionary
    with the keys 'name', 'values' (mapping each int value to a dictionary
    holding its 'label' and 'summary') and, if the enum has a <summary> child,
    'summary'. have_errors is True if enums are not in case-insensitive
    alphabetical order, an enum name or an int value is duplicated, or int
    values are not listed in ascending order.
  """
  enums = {}
  have_errors = False

  last_name = None
  for enum_node in tree.getElementsByTagName('enum'):
    name = enum_node.getAttribute('name')
    if last_name is not None and name.lower() < last_name.lower():
      logging.error('Enums %s and %s are not in alphabetical order', last_name,
                    name)
      have_errors = True
    last_name = name

    if name in enums:
      logging.error('Duplicate enum %s', name)
      have_errors = True
      continue

    int_tags = enum_node.getElementsByTagName('int')
    values = {}
    enum_dict = {'name': name, 'values': values}

    for int_tag in int_tags:
      int_value = int(int_tag.getAttribute('value'))
      if int_value in values:
        logging.error('Duplicate enum value %d for enum %s', int_value, name)
        have_errors = True
        continue
      values[int_value] = {
          'label': int_tag.getAttribute('label'),
          'summary': _GetTextFromChildNodes(int_tag),
      }

    # Second pass: verify ordering. The complete sorted list of values is
    # needed so that the warning can say where a misplaced value belongs.
    enum_int_values = sorted(values)

    last_int_value = None
    for int_tag in int_tags:
      int_value = int(int_tag.getAttribute('value'))
      if last_int_value is not None and int_value < last_int_value:
        logging.error('Enum %s int values %d and %d are not in numerical order',
                      name, last_int_value, int_value)
        have_errors = True
        left_item_index = bisect.bisect_left(enum_int_values, int_value)
        if left_item_index == 0:
          logging.warning('Insert value %d at the beginning', int_value)
        else:
          left_int_value = enum_int_values[left_item_index - 1]
          left_label = values[left_int_value]['label']
          logging.warning('Insert value %d after %d ("%s")', int_value,
                          left_int_value, left_label)
      else:
        # Only advance on in-order values so that each later value is compared
        # against the largest in-order value seen so far.
        last_int_value = int_value

    summary_nodes = enum_node.getElementsByTagName('summary')
    if summary_nodes:
      enum_dict['summary'] = _GetTextFromChildNodes(summary_nodes[0])

    enums[name] = enum_dict

  return enums, have_errors
| |
| |
def _ExtractOwners(histogram):
  """Collects owner information from the <owner> tags under |histogram|.

  Args:
    histogram: A DOM Element corresponding to a histogram.

  Returns:
    A tuple (owners, has_owner), e.g. (['alice@chromium.org'], True).

    owners lists the text of every <owner> tag that matches
    BASIC_EMAIL_REGEXP; the owner placeholder string is never included.
    has_owner is True if at least one <owner> tag holds either such an address
    or the owner placeholder string.
  """
  email_re = re.compile(BASIC_EMAIL_REGEXP)
  emails = []
  found_owner = False

  for owner_tag in histogram.getElementsByTagName('owner'):
    text = _GetTextFromChildNodes(owner_tag)
    if not text:
      continue
    looks_like_email = email_re.match(text)
    if not looks_like_email and OWNER_PLACEHOLDER not in text:
      continue
    found_owner = True
    if looks_like_email:
      emails.append(text)

  return emails, found_owner
| |
| |
def _ValidateDateString(date_str):
  """Checks whether |date_str| can be parsed with EXPIRY_DATE_PATTERN.

  Args:
    date_str: string

  Returns:
    True if strptime() accepts |date_str| for the 'YYYY-MM-DD' pattern, False
    if it raises ValueError.
  """
  try:
    datetime.datetime.strptime(date_str, EXPIRY_DATE_PATTERN)
  except ValueError:
    return False
  else:
    return True
| |
def _ValidateMilestoneString(milestone_str):
  """Returns True iff |milestone_str| matches EXPIRY_MILESTONE_RE ('M*')."""
  matched = EXPIRY_MILESTONE_RE.match(milestone_str)
  return matched is not None
| |
def _ProcessBaseHistogramAttribute(node, histogram_entry):
  """Copies the 'base' attribute of |node|, if present, into |histogram_entry|.

  Args:
    node: A DOM Element that may carry a 'base' attribute.
    histogram_entry: The histogram description dictionary, updated in place.
      'base' is set to True when the attribute equals 'true' (ignoring case)
      and to False otherwise. A base histogram without an 'obsolete' entry
      also receives DEFAULT_BASE_HISTOGRAM_OBSOLETE_REASON.
  """
  if not node.hasAttribute('base'):
    return

  base_flag = node.getAttribute('base').lower() == 'true'
  histogram_entry['base'] = base_flag
  if not base_flag or 'obsolete' in histogram_entry:
    return
  histogram_entry['obsolete'] = DEFAULT_BASE_HISTOGRAM_OBSOLETE_REASON
| |
| |
def _ExtractHistogramsFromXmlTree(tree, enums):
  """Extracts all <histogram> nodes in the tree into a dictionary.

  Args:
    tree: A DOM node; every <histogram> element beneath it is processed.
    enums: A dictionary mapping enum names to enum descriptions, used to
      resolve the 'enum' attribute of histograms.

  Returns:
    A tuple (histograms, have_errors). histograms maps each histogram name to
    a description dictionary that always has 'summary' and may have
    'expires_after', 'owners', 'obsolete', 'units', 'details', 'enum' and
    'base'. have_errors is True if any validation problem was logged.
  """
  # The descriptions can include HTML tags.
  result = {}
  found_errors = False
  previous_name = None

  for node in tree.getElementsByTagName('histogram'):
    name = node.getAttribute('name')
    out_of_order = (previous_name is not None and
                    name.lower() < previous_name.lower())
    if out_of_order:
      logging.error('Histograms %s and %s are not in alphabetical order',
                    previous_name, name)
      found_errors = True
    previous_name = name

    if name in result:
      logging.error('Duplicate histogram definition %s', name)
      found_errors = True
      continue
    entry = {}
    result[name] = entry

    # Expiry: 'never', a milestone, or a date.
    if node.hasAttribute('expires_after'):
      expiry = node.getAttribute('expires_after')
      expiry_is_valid = (expiry == "never" or
                         _ValidateMilestoneString(expiry) or
                         _ValidateDateString(expiry))
      if expiry_is_valid:
        entry['expires_after'] = expiry
      else:
        logging.error(
            'Expiry of histogram %s does not match expected date format ("%s"),'
            ' milestone format (M*), or "never": found %s.', name,
            EXPIRY_DATE_PATTERN, expiry)
        found_errors = True

    # Owners.
    owner_emails, has_owner = _ExtractOwners(node)
    if owner_emails:
      entry['owners'] = owner_emails

    # Summary, with a stand-in when the tag is absent.
    summaries = node.getElementsByTagName('summary')
    if summaries:
      entry['summary'] = _GetTextFromChildNodes(summaries[0])
    else:
      entry['summary'] = 'TBD'

    # Obsolete marker.
    obsoletes = node.getElementsByTagName('obsolete')
    is_obsolete = bool(obsoletes)
    if is_obsolete:
      entry['obsolete'] = _GetTextFromChildNodes(obsoletes[0])

    # Only histograms that are still in use must have a summary and owners.
    if not is_obsolete:
      if not summaries:
        logging.error('histogram %s should provide a <summary>', name)
        found_errors = True
      if not has_owner:
        logging.error('histogram %s should specify <owner>s', name)
        found_errors = True

    # Units.
    if node.hasAttribute('units'):
      entry['units'] = node.getAttribute('units')

    # Details.
    details = node.getElementsByTagName('details')
    if details:
      entry['details'] = _GetTextFromChildNodes(details[0])

    # Enum type; the name must refer to a known enum.
    if node.hasAttribute('enum'):
      enum_name = node.getAttribute('enum')
      if enum_name in enums:
        entry['enum'] = enums[enum_name]
      else:
        logging.error('Unknown enum %s in histogram %s', enum_name, name)
        found_errors = True

    _ProcessBaseHistogramAttribute(node, entry)

  return result, found_errors
| |
| |
def _GetObsoleteReason(node):
  """Returns the text of the node's direct <obsolete> child, if it has one.

  Only direct children of |node| are examined; the first one whose local name
  is 'obsolete' wins.

  Args:
    node: A DOM Element associated with a histogram.

  Returns:
    The obsolete explanation string, or None when there is no <obsolete>
    child.
  """
  obsolete_children = (
      child for child in node.childNodes if child.localName == 'obsolete')
  # There can be at most 1 obsolete element per node.
  first_obsolete = next(obsolete_children, None)
  if first_obsolete is None:
    return None
  return _GetTextFromChildNodes(first_obsolete)
| |
| |
def _UpdateHistogramsWithSuffixes(tree, histograms):
  """Processes <histogram_suffixes> tags and combines with affected histograms.

  The histograms dictionary will be updated in-place by adding new histograms
  created by combining histograms themselves with histogram_suffixes targeting
  these histograms.

  Args:
    tree: XML dom tree.
    histograms: a dictionary of histograms previously extracted from the tree.

  Returns:
    True if any errors were found.
  """
  have_errors = False

  histogram_suffix_tag = 'histogram_suffixes'
  suffix_tag = 'suffix'
  with_tag = 'with-suffix'

  # Verify order of histogram_suffixes fields first.
  last_name = None
  for histogram_suffixes in tree.getElementsByTagName(histogram_suffix_tag):
    name = histogram_suffixes.getAttribute('name')
    if last_name is not None and name.lower() < last_name.lower():
      logging.error('histogram_suffixes %s and %s are not in alphabetical '
                    'order', last_name, name)
      have_errors = True
    last_name = name

  # histogram_suffixes can depend on other histogram_suffixes, so we need to be
  # careful. Make a temporary copy of the list of histogram_suffixes to use as a
  # queue. histogram_suffixes whose dependencies have not yet been processed
  # will get relegated to the back of the queue to be processed later.
  reprocess_queue = []

  def GenerateHistogramSuffixes():
    # Yields (reprocess_count, node) pairs: first every histogram_suffixes node
    # in the tree with a count of 0, then the entries of reprocess_queue. The
    # main loop below appends to reprocess_queue while this generator is still
    # being consumed, so re-queued nodes are yielded as well.
    for f in tree.getElementsByTagName(histogram_suffix_tag):
      yield 0, f
    for r, f in reprocess_queue:
      yield r, f

  for reprocess_count, histogram_suffixes in GenerateHistogramSuffixes():
    # Check dependencies first
    dependencies_valid = True
    affected_histograms = histogram_suffixes.getElementsByTagName(
        'affected-histogram')
    for affected_histogram in affected_histograms:
      histogram_name = affected_histogram.getAttribute('name')
      if histogram_name not in histograms:
        # Base histogram is missing
        dependencies_valid = False
        missing_dependency = histogram_name
        break
    if not dependencies_valid:
      # Retry later, up to MAX_HISTOGRAM_SUFFIX_DEPENDENCY_DEPTH times, in
      # case another histogram_suffixes generates the missing histogram.
      if reprocess_count < MAX_HISTOGRAM_SUFFIX_DEPENDENCY_DEPTH:
        reprocess_queue.append((reprocess_count + 1, histogram_suffixes))
        continue
      else:
        logging.error('histogram_suffixes %s is missing its dependency %s',
                      histogram_suffixes.getAttribute('name'),
                      missing_dependency)
        have_errors = True
        continue

    # If the suffix group has an obsolete tag, all suffixes it generates inherit
    # its reason.
    group_obsolete_reason = _GetObsoleteReason(histogram_suffixes)

    name = histogram_suffixes.getAttribute('name')
    suffix_nodes = histogram_suffixes.getElementsByTagName(suffix_tag)
    # Maps each suffix name to its label so that <with-suffix> references can
    # look the label up by name.
    suffix_labels = {}
    for suffix in suffix_nodes:
      suffix_labels[suffix.getAttribute('name')] = suffix.getAttribute('label')
    # Find owners list under current histogram_suffixes tag.
    owners, _ = _ExtractOwners(histogram_suffixes)

    last_histogram_name = None
    for affected_histogram in affected_histograms:
      histogram_name = affected_histogram.getAttribute('name')
      if (last_histogram_name is not None and
          histogram_name.lower() < last_histogram_name.lower()):
        logging.error('Affected histograms %s and %s of histogram_suffixes %s '
                      'are not in alphabetical order', last_histogram_name,
                      histogram_name, name)
        have_errors = True
      last_histogram_name = histogram_name
      # An affected histogram that lists <with-suffix> children is expanded
      # with those only; otherwise every <suffix> of the group applies.
      with_suffixes = affected_histogram.getElementsByTagName(with_tag)
      if with_suffixes:
        suffixes_to_add = with_suffixes
      else:
        suffixes_to_add = suffix_nodes
      for suffix in suffixes_to_add:
        suffix_name = suffix.getAttribute('name')
        try:
          new_histogram_name = _ExpandHistogramNameWithSuffixes(
              suffix_name, histogram_name, histogram_suffixes)
          # An unchanged name (the expansion returns it as-is for an empty
          # suffix name) means the existing entry is annotated in place rather
          # than copied.
          if new_histogram_name != histogram_name:
            new_histogram = copy.deepcopy(histograms[histogram_name])
            # Do not copy forward base histogram state to suffixed
            # histograms. Any suffixed histograms that wish to remain base
            # histograms must explicitly re-declare themselves as base
            # histograms.
            if new_histogram.get('base', False):
              del new_histogram['base']
              if (new_histogram.get(
                  'obsolete', '') == DEFAULT_BASE_HISTOGRAM_OBSOLETE_REASON):
                del new_histogram['obsolete']
            histograms[new_histogram_name] = new_histogram

          suffix_label = suffix_labels.get(suffix_name, '')

          # TODO(yiyaoliu): Rename these to be consistent with the new naming.
          # It is kept unchanged for now because it's used by dashboards.
          if 'fieldtrial_groups' not in histograms[new_histogram_name]:
            histograms[new_histogram_name]['fieldtrial_groups'] = []
          histograms[new_histogram_name]['fieldtrial_groups'].append(
              suffix_name)

          if 'fieldtrial_names' not in histograms[new_histogram_name]:
            histograms[new_histogram_name]['fieldtrial_names'] = []
          histograms[new_histogram_name]['fieldtrial_names'].append(name)

          if 'fieldtrial_labels' not in histograms[new_histogram_name]:
            histograms[new_histogram_name]['fieldtrial_labels'] = []
          histograms[new_histogram_name]['fieldtrial_labels'].append(
              suffix_label)

          # If no owners are added for this histogram-suffixes, it inherits the
          # owners of its parents.
          if owners:
            histograms[new_histogram_name]['owners'] = owners

          # If a suffix has an obsolete node, it's marked as obsolete for the
          # specified reason, overwriting its group's obsoletion reason if the
          # group itself was obsolete as well.
          obsolete_reason = _GetObsoleteReason(suffix)
          if not obsolete_reason:
            obsolete_reason = group_obsolete_reason

          # If the suffix has an obsolete tag, all histograms it generates
          # inherit it.
          if obsolete_reason:
            histograms[new_histogram_name]['obsolete'] = obsolete_reason

          _ProcessBaseHistogramAttribute(suffix, histograms[new_histogram_name])

        except Error:
          # Record the failure and carry on with the remaining suffixes.
          have_errors = True

  return have_errors
| |
| |
def ExtractHistogramsFromDom(tree):
  """Computes the histogram names and descriptions from the XML representation.

  Attribute values in |tree| are normalized in place, then enums and
  histograms are extracted and histogram_suffixes are expanded.

  Args:
    tree: A DOM tree of XML content.

  Returns:
    a tuple of (histograms, status) where histograms is a dictionary mapping
    histogram names to dictionaries containing histogram descriptions and status
    is a boolean indicating if errors were encountered in processing.
  """
  _NormalizeAllAttributeValues(tree)

  enums, enums_failed = ExtractEnumsFromXmlTree(tree)
  histograms, histograms_failed = _ExtractHistogramsFromXmlTree(tree, enums)
  suffixes_failed = _UpdateHistogramsWithSuffixes(tree, histograms)

  had_errors = enums_failed or histograms_failed or suffixes_failed
  return histograms, had_errors
| |
| |
def ExtractHistograms(filename):
  """Loads histogram definitions from a disk file.

  Args:
    filename: a file path to load data from.

  Returns:
    a dictionary of histogram descriptions.

  Raises:
    Error: if processing the file's contents reported any errors.
  """
  with open(filename, 'r') as xml_file:
    dom = xml.dom.minidom.parse(xml_file)

  histograms, had_errors = ExtractHistogramsFromDom(dom)
  if not had_errors:
    return histograms

  logging.error('Error parsing %s', filename)
  raise Error()
| |
| |
def ExtractNames(histograms):
  """Returns the keys of |histograms| as a sorted list."""
  names = list(histograms.keys())
  names.sort()
  return names