# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
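
"""Parses changelogs, revisions, diffs and blame information from a Git
repository hosted on googlesource.

A minimal usage sketch (the component path and hashes below are hypothetical;
parsed_deps and url_parts_map are assumed to be built by the surrounding
tool):

  parser = GitParser(parsed_deps, url_parts_map)
  (revision_map, file_to_revision_map) = parser.ParseChangelog(
      'src/', 'start_hash', 'end_hash')
"""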

import base64
import xml.dom.minidom as minidom
from xml.parsers.expat import ExpatError

import crash_utils
from repository_parser_interface import ParserInterface

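# Maps a file action name from googlesource to the single-letter change type
# used by the SVN parser.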
FILE_CHANGE_TYPE_MAP = {
    'add': 'A',
    'copy': 'C',
    'delete': 'D',
    'modify': 'M',
    'rename': 'R'
}


def _ConvertToFileChangeType(file_action):
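  """Converts a file action such as 'modify' into its single-letter form."""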
  # TODO(stgao): verify impact on code that checks the file change type.
  return file_action[0].upper()


class GitParser(ParserInterface):
  """Parser for a Git repository hosted on googlesource.

  Attributes:
    parsed_deps: A map from component path to its repository name, regression,
                 etc.
    url_parts_map: A map from url type to its url parts. These parts are
                   appended to the base url to form different urls.
  """

  def __init__(self, parsed_deps, url_parts_map):
    self.component_to_url_map = parsed_deps
    self.url_parts_map = url_parts_map

  def ParseChangelog(self, component_path, range_start, range_end):
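    """Parses the changelog of the given component in the given range.

    Args:
      component_path: The path of the component to parse the changelog for.
      range_start: Starting revision (git hash) of the regression range.
      range_end: Ending revision (git hash) of the regression range.

    Returns:
      A tuple (revision_map, file_to_revision_map), where revision_map maps
      a git hash to its revision information, and file_to_revision_map maps
      a file path to a list of (git hash, file change type) tuples.
    """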
    file_to_revision_map = {}
    revision_map = {}
    base_url = self.component_to_url_map[component_path]['repository']
    changelog_url = base_url + self.url_parts_map['changelog_url']
    revision_url = base_url + self.url_parts_map['revision_url']

    # Retrieve data from the url; return empty maps if it fails. The html url
    # is a url from which the changelog can be parsed as html.
    url = changelog_url % (range_start, range_end)
    html_url = url + '?pretty=fuller'
    response = crash_utils.GetDataFromURL(html_url)
    if not response:
      return (revision_map, file_to_revision_map)

    # Parse xml out of the returned string. If it fails, try parsing from
    # JSON instead.
    try:
      dom = minidom.parseString(response)
    except ExpatError:
      self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
                                  revision_url, revision_map,
                                  file_to_revision_map)
      return (revision_map, file_to_revision_map)

    # The revision information is in the third div through the second-to-last
    # one.
    divs = dom.getElementsByTagName('div')[2:-1]
    pres = dom.getElementsByTagName('pre')
    uls = dom.getElementsByTagName('ul')

    # Divs, pres and uls each contain revision information for one CL, so
    # they should have the same length.
    if not divs or len(divs) != len(pres) or len(pres) != len(uls):
      self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
                                  revision_url, revision_map,
                                  file_to_revision_map)
      return (revision_map, file_to_revision_map)

    # Iterate through the divs and parse revisions.
    for (div, pre, ul) in zip(divs, pres, uls):
      # Create a new revision object for each revision.
      revision = {}

      # There must be three <tr>s. If not, this page is wrong.
      trs = div.getElementsByTagName('tr')
      if len(trs) != 3:
        continue

      # Retrieve the git hash.
      githash = trs[0].getElementsByTagName('a')[0].firstChild.nodeValue

      # Retrieve and set the author and the commit time.
      author = trs[1].getElementsByTagName(
          'td')[0].firstChild.nodeValue.split('<')[0]
      revision['author'] = author
      revision['time'] = trs[1].getElementsByTagName(
          'td')[1].firstChild.nodeValue

      # Retrieve and set the message.
      revision['message'] = pre.firstChild.nodeValue

      # Set the url of this CL.
      revision_url_part = self.url_parts_map['revision_url'] % githash
      revision['url'] = base_url + revision_url_part

      # Go through the changed files; they are in <li> elements.
      lis = ul.getElementsByTagName('li')
      for li in lis:
        # Retrieve the path and action of the changed file.
        file_path = li.getElementsByTagName('a')[0].firstChild.nodeValue
        file_change_type = li.getElementsByTagName('span')[
            0].getAttribute('class')

        # Normalize the file action so that it is the same as the SVN parser.
        file_change_type = _ConvertToFileChangeType(file_change_type)

        # Add the changed file to the map.
        if file_path not in file_to_revision_map:
          file_to_revision_map[file_path] = []
        file_to_revision_map[file_path].append((githash, file_change_type))

      # Add this revision object to the map.
      revision_map[githash] = revision

    # Parse one revision for the start of the range, because googlesource does
    # not include the start of the range.
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)

    return (revision_map, file_to_revision_map)

  def ParseChangelogFromJSON(self, range_start, range_end, changelog_url,
                             revision_url, revision_map, file_to_revision_map):
    """Parses the changelog by going over the JSON response.

    Args:
      range_start: Starting revision (git hash) of the regression range.
      range_end: Ending revision (git hash) of the regression range.
      changelog_url: The url to retrieve the changelog from.
      revision_url: The url to retrieve an individual revision from.
      revision_map: A map from a git hash to its revision information.
      file_to_revision_map: A map from a file path to the git hashes in which
                            it changed.
    """
    # Compute the url from the given range and retrieve the changelog. Stop if
    # it fails.
    changelog_url %= (range_start, range_end)
    json_url = changelog_url + '?format=json'
    response = crash_utils.GetDataFromURL(json_url)
    if not response:
      return

    # Parse the changelog from the returned object. The returned string starts
    # with the security prefix ")]}'\n", so skip the first five characters.
    revisions = crash_utils.LoadJSON(response[5:])
    if not revisions:
      return

    # Parse each individual revision in the log.
    for revision in revisions['log']:
      githash = revision['commit']
      self.ParseRevision(revision_url, githash, revision_map,
                         file_to_revision_map)

    # Parse the revision with range_start, because googlesource ignores
    # that one.
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)

  def ParseRevision(self, revision_url, githash, revision_map,
                    file_to_revision_map):
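    """Parses a single revision and adds it to the maps.

    Args:
      revision_url: The url template to retrieve an individual revision from.
      githash: The git hash of the revision to parse.
      revision_map: A map from a git hash to its revision information.
      file_to_revision_map: A map from a file path to the git hashes in which
                            it changed.
    """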

    # Retrieve data from the url; return if it fails.
    url = revision_url % githash
    response = crash_utils.GetDataFromURL(url + '?format=json')
    if not response:
      return

    # Load the JSON object from the string, skipping the ")]}'\n" security
    # prefix. If it fails, terminate the function.
    json_revision = crash_utils.LoadJSON(response[5:])
    if not json_revision:
      return

    # Create a map representing this revision and get the git hash from the
    # JSON object.
    revision = {}
    githash = json_revision['commit']

    # Set the author, time, message and url of this CL.
    revision['author'] = json_revision['author']['name']
    revision['time'] = json_revision['author']['time']
    revision['message'] = json_revision['message']
    revision['url'] = url

    # Iterate through the changed files.
    for diff in json_revision['tree_diff']:
      file_path = diff['new_path']
      file_change_type = diff['type']

      # Normalize file action so that it fits with svn_repository_parser.
      file_change_type = _ConvertToFileChangeType(file_change_type)

      # Add the file to the map.
      if file_path not in file_to_revision_map:
        file_to_revision_map[file_path] = []
      file_to_revision_map[file_path].append((githash, file_change_type))

    # Add this CL to the map.
    revision_map[githash] = revision

  def ParseLineDiff(self, path, component, file_change_type, githash):
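    """Parses the line diff of the given file in the given revision.

    Args:
      path: The path of the changed file.
      component: The component path the file belongs to.
      file_change_type: The single-letter change type of the file.
      githash: The git hash of the revision to parse the diff of.

    Returns:
      A tuple (url, changed_line_numbers, changed_line_contents).
    """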
    changed_line_numbers = []
    changed_line_contents = []
    base_url = self.component_to_url_map[component]['repository']
    backup_url = (base_url + self.url_parts_map['revision_url']) % githash

    # If the file is added, copied or renamed (not modified), treat it as if
    # it is not changed.
    if file_change_type in ('A', 'C', 'R'):
      # TODO(stgao): Maybe return whole file change for Add, Rename, and Copy?
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Retrieve the diff data from the url; if it fails, return empty lines.
    url = (base_url + self.url_parts_map['diff_url']) % (githash, path)
    data = crash_utils.GetDataFromURL(url + '?format=text')
    if not data:
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Decode the base64-encoded response into the lines of the diff.
    diff = base64.b64decode(data).splitlines()

    # Iterate through the lines in the diff. current_line is -1 whenever we
    # are not inside a diff chunk.
    current_line = -1
    for line in diff:
      line = line.strip()

      # A line starting with @@ begins a new chunk, e.g. '@@ -10,8 +12,9 @@'.
      # Parse out the starting line number in the new version of the file.
      if line.startswith('@@'):
        current_line = int(line.split('+')[1].split(',')[0])

      # If we are in a chunk.
      elif current_line != -1:
        # If the line is either added or modified, record its number and
        # content, stripping only the leading '+'.
        if line.startswith('+'):
          changed_line_numbers.append(current_line)
          changed_line_contents.append(line[1:])

        # Do not increment the current line if the change is a delete.
        if not line.startswith('-'):
          current_line += 1

    # Return the url without '?format=text'.
    return (url, changed_line_numbers, changed_line_contents)

  def ParseBlameInfo(self, component, file_path, line, revision):
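    """Parses the blame information of the given line in the given file.

    Args:
      component: The component path the file belongs to.
      file_path: The path of the file to get the blame information of.
      line: The line number to get the blame information of.
      revision: The revision (git hash) at which to parse the blame.

    Returns:
      A tuple (content, revision, author, revision_url, message, time), or
      None if no blame region contains the line.
    """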
    base_url = self.component_to_url_map[component]['repository']

    # Retrieve blame JSON file from googlesource. If it fails, return None.
    url_part = self.url_parts_map['blame_url'] % (revision, file_path)
    blame_url = base_url + url_part
    json_string = crash_utils.GetDataFromURL(blame_url)
    if not json_string:
      return

    # Parse the JSON object from the string. The returned string starts with
    # the security prefix ")]}'\n", so skip the first five characters.
    annotation = crash_utils.LoadJSON(json_string[5:])
    if not annotation:
      return

    # Go through the regions, each of which is a group of consecutive lines
    # with the same author/revision.
    for blame_line in annotation['regions']:
      start = blame_line['start']
      count = blame_line['count']

      # For each region, check if the line we want the blame info of is in
      # this region.
      if start <= line <= start + count - 1:
        # If we are in the right region, get the information from the line.
        revision = blame_line['commit']
        author = blame_line['author']['name']
        revision_url_parts = self.url_parts_map['revision_url'] % revision
        revision_url = base_url + revision_url_parts
        # TODO(jeun): Add a way to get content from JSON object.
        content = None

        (revision_info, _) = self.ParseChangelog(component, revision, revision)
        message = revision_info[revision]['message']
        time = revision_info[revision]['time']
        return (content, revision, author, revision_url, message, time)

    # Return None if no region contains the line.
    return None