# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
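
"""Parses changelogs, revisions, diffs and blame information from a Git
repository hosted on googlesource.

A minimal usage sketch (the component path and hashes below are hypothetical;
parsed_deps and url_parts_map are assumed to be built by the surrounding
tool):

  parser = GitParser(parsed_deps, url_parts_map)
  (revision_map, file_to_revision_map) = parser.ParseChangelog(
      'src/', 'start_hash', 'end_hash')
"""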

import base64
import xml.dom.minidom as minidom
from xml.parsers.expat import ExpatError

import crash_utils
from repository_parser_interface import ParserInterface

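# Maps a file action name from googlesource to the single-letter change type
# used by the SVN parser.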
FILE_CHANGE_TYPE_MAP = {
    'add': 'A',
    'copy': 'C',
    'delete': 'D',
    'modify': 'M',
    'rename': 'R'
}


def _ConvertToFileChangeType(file_action):
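  """Converts a file action such as 'modify' into its single-letter form."""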
  # TODO(stgao): verify impact on code that checks the file change type.
  return file_action[0].upper()


class GitParser(ParserInterface):
  """Parser for a Git repository hosted on googlesource.

  Attributes:
    parsed_deps: A map from component path to its repository name, regression,
                 etc.
    url_parts_map: A map from url type to its url parts. These parts are
                   appended to the base url to form different urls.
  """

  def __init__(self, parsed_deps, url_parts_map):
    self.component_to_url_map = parsed_deps
    self.url_parts_map = url_parts_map

  def ParseChangelog(self, component_path, range_start, range_end):
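    """Parses the changelog of the given component in the given range.

    Args:
      component_path: The path of the component to parse the changelog for.
      range_start: Starting revision (git hash) of the regression range.
      range_end: Ending revision (git hash) of the regression range.

    Returns:
      A tuple (revision_map, file_to_revision_map), where revision_map maps
      a git hash to its revision information, and file_to_revision_map maps
      a file path to a list of (git hash, file change type) tuples.
    """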
    file_to_revision_map = {}
    revision_map = {}
    base_url = self.component_to_url_map[component_path]['repository']
    changelog_url = base_url + self.url_parts_map['changelog_url']
    revision_url = base_url + self.url_parts_map['revision_url']

    # Retrieve data from the url; return empty maps if it fails. The html url
    # is a url from which the changelog can be parsed as html.
    url = changelog_url % (range_start, range_end)
    html_url = url + '?pretty=fuller'
    response = crash_utils.GetDataFromURL(html_url)
    if not response:
      return (revision_map, file_to_revision_map)

    # Parse xml out of the returned string. If it fails, try parsing from
    # JSON instead.
    try:
      dom = minidom.parseString(response)
    except ExpatError:
      self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
                                  revision_url, revision_map,
                                  file_to_revision_map)
      return (revision_map, file_to_revision_map)

    # The revision information is in the third div through the second-to-last
    # one.
    divs = dom.getElementsByTagName('div')[2:-1]
    pres = dom.getElementsByTagName('pre')
    uls = dom.getElementsByTagName('ul')

    # Divs, pres and uls each contain revision information for one CL, so
    # they should have the same length.
    if not divs or len(divs) != len(pres) or len(pres) != len(uls):
      self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
                                  revision_url, revision_map,
                                  file_to_revision_map)
      return (revision_map, file_to_revision_map)

    # Iterate through the divs and parse revisions.
    for (div, pre, ul) in zip(divs, pres, uls):
      # Create a new revision object for each revision.
      revision = {}

      # There must be three <tr>s. If not, this page is wrong.
      trs = div.getElementsByTagName('tr')
      if len(trs) != 3:
        continue

      # Retrieve the git hash.
      githash = trs[0].getElementsByTagName('a')[0].firstChild.nodeValue

      # Retrieve and set the author and the commit time.
      author = trs[1].getElementsByTagName(
          'td')[0].firstChild.nodeValue.split('<')[0]
      revision['author'] = author
      revision['time'] = trs[1].getElementsByTagName(
          'td')[1].firstChild.nodeValue

      # Retrieve and set the message.
      revision['message'] = pre.firstChild.nodeValue

      # Set the url of this CL.
      revision_url_part = self.url_parts_map['revision_url'] % githash
      revision['url'] = base_url + revision_url_part

      # Go through the changed files; they are in <li> elements.
      lis = ul.getElementsByTagName('li')
      for li in lis:
        # Retrieve the path and action of the changed file.
        file_path = li.getElementsByTagName('a')[0].firstChild.nodeValue
        file_change_type = li.getElementsByTagName('span')[
            0].getAttribute('class')

        # Normalize the file action so that it is the same as the SVN parser.
        file_change_type = _ConvertToFileChangeType(file_change_type)

        # Add the changed file to the map.
        if file_path not in file_to_revision_map:
          file_to_revision_map[file_path] = []
        file_to_revision_map[file_path].append((githash, file_change_type))

      # Add this revision object to the map.
      revision_map[githash] = revision

    # Parse one revision for the start of the range, because googlesource does
    # not include the start of the range.
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)

    return (revision_map, file_to_revision_map)

  def ParseChangelogFromJSON(self, range_start, range_end, changelog_url,
                             revision_url, revision_map, file_to_revision_map):
    """Parses the changelog by going over the JSON response.

    Args:
      range_start: Starting revision (git hash) of the regression range.
      range_end: Ending revision (git hash) of the regression range.
      changelog_url: The url to retrieve the changelog from.
      revision_url: The url to retrieve an individual revision from.
      revision_map: A map from a git hash to its revision information.
      file_to_revision_map: A map from a file path to the git hashes in which
                            it changed.
    """
    # Compute the url from the given range and retrieve the changelog. Stop if
    # it fails.
    changelog_url %= (range_start, range_end)
    json_url = changelog_url + '?format=json'
    response = crash_utils.GetDataFromURL(json_url)
    if not response:
      return

    # Parse the changelog from the returned object. The returned string starts
    # with the security prefix ")]}'\n", so skip the first five characters.
    revisions = crash_utils.LoadJSON(response[5:])
    if not revisions:
      return

    # Parse each individual revision in the log.
    for revision in revisions['log']:
      githash = revision['commit']
      self.ParseRevision(revision_url, githash, revision_map,
                         file_to_revision_map)

    # Parse the revision with range_start, because googlesource ignores
    # that one.
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)

  def ParseRevision(self, revision_url, githash, revision_map,
                    file_to_revision_map):
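    """Parses a single revision and adds it to the maps.

    Args:
      revision_url: The url template to retrieve an individual revision from.
      githash: The git hash of the revision to parse.
      revision_map: A map from a git hash to its revision information.
      file_to_revision_map: A map from a file path to the git hashes in which
                            it changed.
    """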

    # Retrieve data from the url; return if it fails.
    url = revision_url % githash
    response = crash_utils.GetDataFromURL(url + '?format=json')
    if not response:
      return

    # Load the JSON object from the string, skipping the ")]}'\n" security
    # prefix. If it fails, terminate the function.
    json_revision = crash_utils.LoadJSON(response[5:])
    if not json_revision:
      return

    # Create a map representing this revision and get the git hash from the
    # JSON object.
    revision = {}
    githash = json_revision['commit']

    # Set the author, time, message and url of this CL.
    revision['author'] = json_revision['author']['name']
    revision['time'] = json_revision['author']['time']
    revision['message'] = json_revision['message']
    revision['url'] = url

    # Iterate through the changed files.
    for diff in json_revision['tree_diff']:
      file_path = diff['new_path']
      file_change_type = diff['type']

      # Normalize file action so that it fits with svn_repository_parser.
      file_change_type = _ConvertToFileChangeType(file_change_type)

      # Add the file to the map.
      if file_path not in file_to_revision_map:
        file_to_revision_map[file_path] = []
      file_to_revision_map[file_path].append((githash, file_change_type))

    # Add this CL to the map.
    revision_map[githash] = revision

  def ParseLineDiff(self, path, component, file_change_type, githash):
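    """Parses the line diff of the given file in the given revision.

    Args:
      path: The path of the changed file.
      component: The component path the file belongs to.
      file_change_type: The single-letter change type of the file.
      githash: The git hash of the revision to parse the diff of.

    Returns:
      A tuple (url, changed_line_numbers, changed_line_contents).
    """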
    changed_line_numbers = []
    changed_line_contents = []
    base_url = self.component_to_url_map[component]['repository']
    backup_url = (base_url + self.url_parts_map['revision_url']) % githash

    # If the file is added, copied or renamed (not modified), treat it as if
    # it is not changed.
    if file_change_type in ('A', 'C', 'R'):
      # TODO(stgao): Maybe return whole file change for Add, Rename, and Copy?
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Retrieve the diff data from the url; if it fails, return empty lines.
    url = (base_url + self.url_parts_map['diff_url']) % (githash, path)
    data = crash_utils.GetDataFromURL(url + '?format=text')
    if not data:
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Decode the base64-encoded response into the lines of the diff.
    diff = base64.b64decode(data).splitlines()

    # Iterate through the lines in the diff. current_line is -1 whenever we
    # are not inside a diff chunk.
    current_line = -1
    for line in diff:
      line = line.strip()

      # A line starting with @@ begins a new chunk, e.g. '@@ -10,8 +12,9 @@'.
      # Parse out the starting line number in the new version of the file.
      if line.startswith('@@'):
        current_line = int(line.split('+')[1].split(',')[0])

      # If we are in a chunk.
      elif current_line != -1:
        # If the line is either added or modified, record its number and
        # content, stripping only the leading '+'.
        if line.startswith('+'):
          changed_line_numbers.append(current_line)
          changed_line_contents.append(line[1:])

        # Do not increment the current line if the change is a delete.
        if not line.startswith('-'):
          current_line += 1

    # Return the url without '?format=text'.
    return (url, changed_line_numbers, changed_line_contents)

  def ParseBlameInfo(self, component, file_path, line, revision):
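    """Parses the blame information of the given line in the given file.

    Args:
      component: The component path the file belongs to.
      file_path: The path of the file to get the blame information of.
      line: The line number to get the blame information of.
      revision: The revision (git hash) at which to parse the blame.

    Returns:
      A tuple (content, revision, author, revision_url, message, time), or
      None if no blame region contains the line.
    """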
    base_url = self.component_to_url_map[component]['repository']

    # Retrieve blame JSON file from googlesource. If it fails, return None.
    url_part = self.url_parts_map['blame_url'] % (revision, file_path)
    blame_url = base_url + url_part
    json_string = crash_utils.GetDataFromURL(blame_url)
    if not json_string:
      return

    # Parse the JSON object from the string. The returned string starts with
    # the security prefix ")]}'\n", so skip the first five characters.
    annotation = crash_utils.LoadJSON(json_string[5:])
    if not annotation:
      return

    # Go through the regions, each of which is a group of consecutive lines
    # with the same author/revision.
    for blame_line in annotation['regions']:
      start = blame_line['start']
      count = blame_line['count']

      # For each region, check if the line we want the blame info of is in
      # this region.
      if start <= line <= start + count - 1:
        # If we are in the right region, get the information from the line.
        revision = blame_line['commit']
        author = blame_line['author']['name']
        revision_url_parts = self.url_parts_map['revision_url'] % revision
        revision_url = base_url + revision_url_parts
        # TODO(jeun): Add a way to get content from JSON object.
        content = None

        (revision_info, _) = self.ParseChangelog(component, revision, revision)
        message = revision_info[revision]['message']
        time = revision_info[revision]['time']
        return (content, revision, author, revision_url, message, time)

    # Return None if no region contains the line.
    return None