| #!/usr/bin/env python3 |
| # Copyright 2017 The Chromium Authors |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| import argparse |
| import colorsys |
| import difflib |
| import html |
| import random |
| import os |
| import re |
| import subprocess |
| import sys |
| import tempfile |
| import textwrap |
| import webbrowser |
| |
| |
class TokenContext:
  """A single token plus its location and the commit blamed for it.

  Attributes:
    row: Zero-based row (line) index of the token in the data file.
    column: Zero-based column index at which the token starts.
    token: The token text.
    commit: The Commit object that introduced this token, or None while
        the token has not been attributed to a commit yet.
  """

  def __init__(self, row, column, token, commit=None):
    # Position of the token inside the file.
    self.row, self.column = row, column
    # The token text and (once known) the commit it is attributed to.
    self.token, self.commit = token, commit
| |
| |
class Commit:
  """Everything known about one commit taken from the git log.

  Attributes:
    hash: The commit hash.
    author_name: Name of the commit's author.
    author_email: Email address of the commit's author.
    author_date: Date and time at which the author created the commit.
    message: The commit message.
    diff: The commit's diff.
  """

  def __init__(self, hash, author_name, author_email, author_date, message,
               diff):
    # Identity of the commit.
    self.hash = hash
    # Authorship information.
    self.author_name, self.author_email = author_name, author_email
    self.author_date = author_date
    # What the commit says and what it changes.
    self.message, self.diff = message, diff
| |
| |
def tokenize_data(data, tokenize_by_char, tokenize_whitespace):
  """Tokenizes |data|.

  Args:
    data: String to tokenize.
    tokenize_by_char: If true, individual characters are treated as tokens.
        Otherwise, tokens are either symbols or strings of both alphanumeric
        characters and underscores.
    tokenize_whitespace: Treat non-newline whitespace characters as tokens.

  Returns:
    A list of lists of TokenContexts. Each inner list represents a line, so
    the result always holds one more list than |data| has newline characters
    (the last list is empty when |data| ends with a newline).
  """
  contexts = []
  line_contexts = []
  in_identifier = False
  identifier_start = 0
  identifier = ''
  row = 0
  column = 0

  for c in data:
    if not tokenize_by_char and (c.isalnum() or c == '_'):
      # Identifier characters are accumulated until a non-identifier
      # character (or the end of the input) terminates the identifier.
      if in_identifier:
        identifier += c
      else:
        in_identifier = True
        identifier_start = column
        identifier = c
    else:
      if in_identifier:
        line_contexts.append(TokenContext(row, identifier_start, identifier))
        in_identifier = False
      # Newlines are never tokens; other whitespace only on request.
      if not c.isspace() or (tokenize_whitespace and c != '\n'):
        line_contexts.append(TokenContext(row, column, c))

    if c == '\n':
      row += 1
      column = 0
      contexts.append(line_contexts)
      line_contexts = []
    else:
      column += 1

  # Flush an identifier that runs up to the very end of the input. Without
  # this, the last token of any input that does not end in a separator (for
  # example a single diff line passed in without its newline) is lost.
  if in_identifier:
    line_contexts.append(TokenContext(row, identifier_start, identifier))
  contexts.append(line_contexts)
  return contexts
| |
| |
def compute_unified_diff(old_tokens, new_tokens):
  """Diffs two token sequences.

  Args:
    old_tokens: Token strings corresponding to the old data.
    new_tokens: Token strings corresponding to the new data.

  Returns:
    An iterator over the lines of the diff in unified diff format, produced
    without context lines and without line terminators.
  """
  diff_options = {'n': 0, 'lineterm': ''}
  return difflib.unified_diff(old_tokens, new_tokens, **diff_options)
| |
| |
def parse_chunk_header_file_range(file_range):
  """Converts a chunk header file range into a half-open index range.

  Diff chunk headers have the form:
    @@ -<file-range> +<file-range> @@
  File ranges have the form:
    <start line number>,<number of lines changed>
  or, when a single line is affected, just:
    <start line number>

  Args:
    file_range: A chunk header file range.

  Returns:
    A tuple (range_start, range_end). The endpoints are adjusted such that
    iterating over [range_start, range_end) will give the changed indices.
  """
  # The short form names exactly one (1-based) line.
  if ',' not in file_range:
    line_number = int(file_range)
    return (line_number - 1, line_number)

  pieces = file_range.split(',')
  first_line = int(pieces[0])
  line_count = int(pieces[1])
  # An empty range is returned as an empty interval at |first_line| itself,
  # without the 1-based to 0-based shift applied to non-empty ranges.
  if line_count == 0:
    return (first_line, first_line)
  range_start = first_line - 1
  return (range_start, range_start + line_count)
| |
| |
def compute_changed_token_indices(previous_tokens, current_tokens):
  """Classifies the tokens of |current_tokens| as added or carried over.

  Args:
    previous_tokens: Tokens corresponding to the old file.
    current_tokens: Tokens corresponding to the new file.

  Returns:
    A tuple (added_tokens, changed_tokens).
    added_tokens: A list of indices into |current_tokens| that fall inside
        the added range of a diff chunk.
    changed_tokens: A map of indices into |current_tokens| to indices into
        |previous_tokens|, with one entry for every token lying outside of
        the diff chunks.
  """
  added_tokens = []
  changed_tokens = {}
  # End of the most recently seen chunk, on the old and the new side.
  old_cursor = 0
  new_cursor = 0

  def carry_over(count):
    # Pairs up |count| consecutive tokens starting at the two cursors.
    changed_tokens.update(
        zip(range(new_cursor, new_cursor + count),
            range(old_cursor, old_cursor + count)))

  for diff_line in compute_unified_diff(previous_tokens, current_tokens):
    if not diff_line.startswith("@@"):
      continue
    fields = diff_line.split(' ')
    removed_start, removed_end = parse_chunk_header_file_range(
        fields[1].lstrip('-'))
    added_start, added_end = parse_chunk_header_file_range(
        fields[2].lstrip('+'))
    added_tokens.extend(range(added_start, added_end))
    # Everything between the previous chunk and this one is untouched.
    carry_over(removed_start - old_cursor)
    old_cursor = removed_end
    new_cursor = added_end

  # Tokens after the final chunk are untouched as well.
  carry_over(len(previous_tokens) - old_cursor)
  return added_tokens, changed_tokens
| |
| |
def flatten_nested_list(l):
  """Flattens a list of lists, remembering where each element came from.

  Args:
    l: A list of lists.

  Returns:
    A tuple (flattened, index_to_position):
    flattened: The flattened list.
    index_to_position: A dict mapping each index i of |flattened| to the
        pair (r, c) for which flattened[i] == l[r][c].
  """
  flattened = []
  index_to_position = {}
  for r, nested_list in enumerate(l):
    for c, element in enumerate(nested_list):
      # The next free slot in |flattened| is the element's flat index.
      index_to_position[len(flattened)] = (r, c)
      flattened.append(element)
  return (flattened, index_to_position)
| |
| |
def compute_changed_token_positions(previous_tokens, current_tokens):
  """Computes added and carried-over tokens as (row, column) positions.

  Args:
    previous_tokens: A list of lists of token strings. Lines in the file
        correspond to the nested lists.
    current_tokens: A list of lists of token strings. Lines in the file
        correspond to the nested lists.

  Returns:
    A tuple (added_token_positions, changed_token_positions):
    added_token_positions: A list of pairs that index into |current_tokens|.
    changed_token_positions: A map from pairs that index into
        |current_tokens| to pairs that index into |previous_tokens|.
  """
  # The diff works on flat sequences, so flatten both sides and keep the
  # lookup tables needed to translate flat indices back into positions.
  flat_previous, previous_positions = flatten_nested_list(previous_tokens)
  flat_current, current_positions = flatten_nested_list(current_tokens)

  added_indices, changed_indices = compute_changed_token_indices(
      flat_previous, flat_current)

  added_token_positions = []
  for flat_index in added_indices:
    added_token_positions.append(current_positions[flat_index])

  changed_token_positions = {}
  for current_index, previous_index in changed_indices.items():
    changed_token_positions[current_positions[current_index]] = (
        previous_positions[previous_index])

  return (added_token_positions, changed_token_positions)
| |
| |
def parse_chunks_from_diff(diff):
  """Returns a generator of chunk data from a diff.

  Every chunk of the diff is split into its runs of consecutive removed and
  added lines; one tuple is generated per run.

  Args:
    diff: A list of strings, with each string being a line from a diff
        in unified diff format.

  Returns:
    A generator of tuples (added_lines_start, added_lines_end, removed_lines).
    added_lines_start and added_lines_end delimit the half-open range of
    line indices the run added on the new side of the diff; removed_lines is
    the list of lines (without the leading '-') that the run removed.

  Raises:
    ValueError: If |diff| ends before a chunk announced by a chunk header
        is complete.
  """
  it = iter(diff)
  while True:
    # Skip file headers and any other lines up to the next chunk header.
    # Running out of lines here is the normal way for the diff to end; it
    # must not surface as a StopIteration inside this generator, which
    # Python would turn into a RuntimeError. This happens whenever lines
    # follow the last chunk or the diff contains no chunk at all.
    header = next((line for line in it if line.startswith('@@')), None)
    if header is None:
      return
    parts = header.split(' ')
    previous_start, previous_end = parse_chunk_header_file_range(
        parts[1].lstrip('-'))
    current_start, current_end = parse_chunk_header_file_range(
        parts[2].lstrip('+'))

    in_delta = False
    added_lines_start = None
    added_lines_end = None
    removed_lines = []
    # The header says how many lines each side of the chunk has; consume
    # lines until both sides are used up.
    while previous_start < previous_end or current_start < current_end:
      line = next(it, None)
      if line is None:
        raise ValueError('diff ended in the middle of a chunk')
      firstchar = line[0]
      line = line[1:]
      if not in_delta and (firstchar == '-' or firstchar == '+'):
        # A new run of changed lines starts here. A fresh list is bound so
        # that lists which were already yielded are left untouched.
        in_delta = True
        added_lines_start = current_start
        added_lines_end = current_start
        removed_lines = []

      if firstchar == '-':
        removed_lines.append(line)
        previous_start += 1
      elif firstchar == '+':
        current_start += 1
        added_lines_end = current_start
      elif firstchar == ' ':
        # A context line terminates the current run, if there is one.
        if in_delta:
          in_delta = False
          yield (added_lines_start, added_lines_end, removed_lines)
        previous_start += 1
        current_start += 1
      # Lines with any other prefix do not count towards either side of
      # the chunk and are ignored.
    if in_delta:
      yield (added_lines_start, added_lines_end, removed_lines)
| |
| |
def should_skip_commit(commit):
  """Tells whether |commit| must be ignored while computing the blame.

  Commit 5d4451e deleted all files in the repo except for DEPS. The
  next commit, 1e7896, brought them back. This is a hack to skip
  those commits (except for the files they modified). If we did not
  do this, changes would be incorrectly attributed to 1e7896.

  Args:
    commit: A Commit object.

  Returns:
    A boolean indicating if this commit should be skipped.
  """
  banned_commits = (
      '1e78967ed2f1937b3809c19d91e7dd62d756d307',
      '5d4451ebf298d9d71f716cc0135f465cec41fcd0',
  )
  if commit.hash not in banned_commits:
    return False

  # Files for which even the banned commits are taken into account.
  banned_commits_file_exceptions = (
      'DEPS',
      'chrome/browser/ui/views/file_manager_dialog_browsertest.cc',
  )
  for diff_line in commit.diff:
    if diff_line.startswith(('---', '+++')):
      # File header line: the file name is the second space-separated field.
      touched_file = diff_line.split(' ')[1]
      if touched_file in banned_commits_file_exceptions:
        return False
    elif diff_line.startswith('@@'):
      # First chunk header reached without seeing an excepted file.
      return True
  assert False
| |
| |
def generate_substrings(file):
  """Generates the NUL-separated substrings of a binary file stream.

  Empty substrings are skipped. For example, the bytes 'a', NUL, 'bc', NUL,
  NUL, NUL, 'd', NUL produce the output:
    ['a', 'bc', 'd']

  Args:
    file: A readable file that returns bytes.
  """
  BUF_SIZE = 448  # Experimentally found to be pretty fast.
  # Pieces of the substring that is still waiting for its terminating NUL.
  pending = []
  while True:
    chunk = file.read(BUF_SIZE)
    head, *rest = chunk.split(b'\0')
    pending.append(head)
    if rest:
      # At least one NUL was read: the pending substring is complete, and
      # so is everything between the first and the last NUL of the chunk.
      *complete, tail = rest
      for piece in [b''.join(pending)] + complete:
        if piece != b'':
          yield piece.decode()
      pending = [tail]
    if len(chunk) < BUF_SIZE:
      # A short read marks the end of the stream.
      break

  leftover = b''.join(pending)
  if leftover != b'':
    yield leftover.decode()
| |
| |
def generate_commits(git_log_stdout):
  """Parses git log output into a stream of Commit objects.

  Each commit is read as six consecutive substrings: hash, author name,
  author email, author date, message and diff. The stream ends as soon as a
  complete set of six can no longer be read.

  Args:
    git_log_stdout: A readable binary file with the git log output.
  """
  fields = generate_substrings(git_log_stdout)
  while True:
    try:
      commit_hash = next(fields)
      author_name = next(fields)
      author_email = next(fields)
      author_date = next(fields)
      message = next(fields).rstrip('\n')
      # The first and the last line of the diff text are dropped.
      diff_lines = next(fields).split('\n')[1:-1]
    except StopIteration:
      return
    yield Commit(commit_hash, author_name, author_email, author_date, message,
                 diff_lines)
| |
| |
def uberblame_aux(file_name, git_log_stdout, data, tokenization_method):
  """Computes the uberblame of file |file_name|.

  Starts from the tokens of |data| and walks through the commits parsed from
  |git_log_stdout|, undoing each commit's chunks in turn. Tokens that a
  commit added are attributed to that commit; all other tokens are carried
  over to the previous version of the file.

  Args:
    file_name: File to uberblame. Not used by this function.
    git_log_stdout: A file object that represents the git log output.
    data: A string containing the data of file |file_name|.
    tokenization_method: A function that takes a string and returns the
        tokens of that string as a list of lists of TokenContexts (one
        inner list per line).

  Returns:
    A tuple (data, blame).
    data: File contents.
    blame: A list of lists of TokenContexts, one inner list per line of
        |data|.
  """
  blame = tokenization_method(data)

  blamed_tokens = 0
  # Shallow copy: the result keeps the line lists of the final file, while
  # |blame| itself is rewritten below into older and older versions.
  uber_blame = (data, blame[:])

  for commit in generate_commits(git_log_stdout):
    if should_skip_commit(commit):
      continue

    # Chunk line numbers refer to the file as it is after |commit|. Undoing
    # an earlier chunk changes the length of |blame|, so later chunks of
    # the same commit have to be shifted by the accumulated difference.
    offset = 0
    for (added_lines_start, added_lines_end,
         removed_lines) in parse_chunks_from_diff(commit.diff):
      added_lines_start += offset
      added_lines_end += offset
      # Tokens of the lines this chunk removed, i.e. the older version.
      previous_contexts = [
          token_lines
          for line_previous in removed_lines
          for token_lines in tokenization_method(line_previous)
      ]
      previous_tokens = [[context.token for context in contexts]
                         for contexts in previous_contexts]
      # Tokens of the lines this chunk added, i.e. the newer version.
      current_contexts = blame[added_lines_start:added_lines_end]
      current_tokens = [[context.token for context in contexts]
                        for contexts in current_contexts]
      added_token_positions, changed_token_positions = (
          compute_changed_token_positions(previous_tokens, current_tokens))
      # Tokens that only exist in the newer version came from |commit|.
      for r, c in added_token_positions:
        current_contexts[r][c].commit = commit
        blamed_tokens += 1
      # Tokens present in both versions keep their TokenContext object, so
      # that an older commit can still set the commit on it later on.
      for r, c in changed_token_positions:
        pr, pc = changed_token_positions[(r, c)]
        previous_contexts[pr][pc] = current_contexts[r][c]

      assert added_lines_start <= added_lines_end <= len(blame)
      current_blame_size = len(blame)
      # Replace the added lines by the lines they replaced.
      blame[added_lines_start:added_lines_end] = previous_contexts
      offset += len(blame) - current_blame_size

  # Once every commit has been undone no tokens may be left over.
  assert blame == [] or blame == [[]]
  return uber_blame
| |
| |
def uberblame(file_name, revision, tokenization_method):
  """Computes the uberblame of file |file_name|.

  Runs 'git log' to obtain the history of the file and 'git show' to obtain
  its contents at |revision|, then attributes every token to a commit.

  Args:
    file_name: File to uberblame.
    revision: The revision to start the uberblame at.
    tokenization_method: A function that takes a string and returns the
        tokens of that string as a list of lists of TokenContexts (one
        inner list per line).

  Returns:
    A tuple (data, blame).
    data: File contents.
    blame: A list of lists of TokenContexts, one inner list per line.

  Raises:
    subprocess.CalledProcessError: If 'git show' or 'git log' fails. For
        'git log', both the output and the stderr attribute of the
        exception hold the text git wrote to stderr.
  """
  DIFF_CONTEXT = 3
  cmd_git_log = [
      'git', 'log', '--minimal', '--no-prefix', '--follow', '-m',
      '--first-parent', '-p',
      '-U%d' % DIFF_CONTEXT, '-z', '--format=%x00%H%x00%an%x00%ae%x00%ad%x00%B',
      revision, '--', file_name
  ]
  # The context manager closes the pipes and reaps the child even when
  # 'git show' or the blame computation raises, so that no git process or
  # file descriptor is left behind.
  with subprocess.Popen(
      cmd_git_log, stdout=subprocess.PIPE,
      stderr=subprocess.PIPE) as git_log:
    data = subprocess.check_output(
        ['git', 'show', '%s:%s' % (revision, file_name)]).decode()
    data, blame = uberblame_aux(file_name, git_log.stdout, data,
                                tokenization_method)
    # Collects stderr and waits for git to exit.
    stderr = git_log.communicate()[1].decode()

  if git_log.returncode != 0:
    raise subprocess.CalledProcessError(
        git_log.returncode, cmd_git_log, stderr, stderr=stderr)
  return data, blame
| |
| |
def generate_pastel_color():
  """Picks a random color from a nice looking pastel palette.

  The color is drawn in HLS space with any hue, a lightness between 0.8 and
  0.9 and a saturation between 0.5 and 1.

  Returns:
    The color, formatted as hex string. For example, white is "#FFFFFF".
  """
  # The three draws happen in this order: hue, lightness, saturation.
  hue = random.uniform(0, 1)
  lightness = random.uniform(0.8, 0.9)
  saturation = random.uniform(0.5, 1)
  channels = colorsys.hls_to_rgb(hue, lightness, saturation)
  return "#%0.2X%0.2X%0.2X" % tuple(int(value * 255) for value in channels)
| |
| |
def colorize_diff(diff):
  """Colorizes a diff for use in an HTML page.

  Every line is HTML-escaped and wrapped in a span whose class tells what
  kind of line it is. The double quotes of the generated markup are
  preceded by a backslash, so the result is meant to be placed inside a
  double-quoted JavaScript string literal.

  Args:
    diff: The diff, in unified diff format, as a list of line strings.

  Returns:
    The HTML-formatted diff, as a string. The diff will already be escaped.
  """
  span = '<span class=\\"%s\\">%s</span>'
  colorized = []
  for line in diff:
    escaped = html.escape(line.replace('\r', ''), quote=True)
    if line.startswith('+'):
      colorized.append(span % ('addition', escaped))
    elif line.startswith('-'):
      colorized.append(span % ('deletion', escaped))
    elif line.startswith('@@'):
      # A chunk header is "@@ <ranges> @@<context>"; the two parts get
      # separate spans. The search starts at 2 to skip the opening "@@".
      context_begin = escaped.find('@@', 2)
      assert context_begin != -1
      meta_end = context_begin + 2
      # Both spans must be properly closed: an unterminated closing tag
      # would swallow the opening tag of the following line.
      colorized.append(span % ('chunk_meta', escaped[:meta_end]) +
                       span % ('chunk_context', escaped[meta_end:]))
    elif line.startswith('diff') or line.startswith('index'):
      colorized.append(span % ('file_header', escaped))
    else:
      colorized.append(span % ('context_line', escaped))
  return '\n'.join(colorized)
| |
| |
def create_visualization(data, blame):
  """Creates a web page to visualize |blame|.

  The page shows the file with every run of tokens from the same commit
  highlighted in that commit's color. Clicking a run displays the commit's
  description and diff next to the file.

  Args:
    data: The file contents, as returned by uberblame().
    blame: A list of lists of TokenContexts (one inner list per line), as
        returned by uberblame(). The commit of every token must be set.

  Returns:
    The HTML for the generated page, as a string.
  """
  # Imported here so that this function does not rely on the module-level
  # import block providing it.
  import json

  # Use the same seed for the color generator on each run so that
  # loading the same blame of the same file twice will result in the
  # same generated HTML page.
  random.seed(0x52937865ec62d1ea)
  page = """\
      <html>
        <head>
          <style>
            body {
              font-family: monospace;
            }
            pre {
              display: inline;
            }
            .token {
              outline: 1pt solid #00000030;
              outline-offset: -1pt;
              cursor: pointer;
            }
            .addition {
              color: #080;
            }
            .deletion {
              color: #c00;
            }
            .chunk_meta {
              color: #099;
            }
            .context_line .chunk_context {
              // Just normal text.
            }
            .file_header {
              font-weight: bold;
            }
            #linenums {
              text-align: right;
            }
            #file_display {
              position: absolute;
              left: 0;
              top: 0;
              width: 50%%;
              height: 100%%;
              overflow: scroll;
            }
            #commit_display_container {
              position: absolute;
              left: 50%%;
              top: 0;
              width: 50%%;
              height: 100%%;
              overflow: scroll;
            }
          </style>
          <script>
            commit_data = %s;
            function display_commit(hash) {
              var e = document.getElementById("commit_display");
              e.innerHTML = commit_data[hash]
            }
          </script>
        </head>
        <body>
          <div id="file_display">
            <table>
              <tbody>
                <tr>
                  <td valign="top" id="linenums">
                    <pre>%s</pre>
                  </td>
                  <td valign="top">
                    <pre>%s</pre>
                  </td>
                </tr>
              </tbody>
            </table>
          </div>
          <div id="commit_display_container" valign="top">
            <pre id="commit_display" />
          </div>
        </body>
      </html>
      """
  page = textwrap.dedent(page)
  commits = {}
  lines = []
  commit_colors = {}
  blame_index = 0
  blame = [context for contexts in blame for context in contexts]
  row = 0
  lastline = ''
  for line in data.split('\n'):
    lastline = line
    column = 0
    for c in line + '\n':
      # Close the span of the current token when its end is reached, unless
      # the next token belongs to the same commit and shares the span.
      if blame_index < len(blame):
        token_context = blame[blame_index]
        if (row == token_context.row and
            column == token_context.column + len(token_context.token)):
          if (blame_index + 1 == len(blame) or blame[blame_index].commit.hash !=
              blame[blame_index + 1].commit.hash):
            lines.append('</span>')
          blame_index += 1
      # Open a span at the start of a token that begins a new commit run.
      if blame_index < len(blame):
        token_context = blame[blame_index]
        if row == token_context.row and column == token_context.column:
          if (blame_index == 0 or blame[blame_index - 1].commit.hash !=
              blame[blame_index].commit.hash):
            commit_hash = token_context.commit.hash
            commits[commit_hash] = token_context.commit
            if commit_hash not in commit_colors:
              commit_colors[commit_hash] = generate_pastel_color()
            color = commit_colors[commit_hash]
            # The quotes around the argument have to be written as entities:
            # literal double quotes would end the onclick attribute early.
            lines.append(('<span class="token" style="background-color: %s" ' +
                          'onclick="display_commit(&quot;%s&quot;)">') %
                         (color, commit_hash))
      lines.append(html.escape(c))
      column += 1
    row += 1

  commit_display_format = """\
      commit: {hash}
      Author: {author_name} <{author_email}>
      Date: {author_date}

      {message}

      """
  commit_display_format = textwrap.dedent(commit_display_format)
  # The text is HTML-escaped before links are searched, so a literal '<'
  # can only be the start of generated markup and must end the URL.
  links = re.compile(r'(https?://[^\s<]+)')
  commit_displays = {}
  for commit_hash, commit in commits.items():
    header = commit_display_format.format(
        hash=commit_hash,
        author_name=commit.author_name,
        author_email=commit.author_email,
        author_date=commit.author_date,
        message=commit.message)
    # colorize_diff() puts a backslash in front of the double quotes of its
    # markup. Undo that here: the JavaScript escaping of the whole text is
    # left to json.dumps() below. The diff text itself cannot contain a
    # literal double quote, since it has been HTML-escaped.
    diff_html = colorize_diff(commit.diff).replace('\\"', '"')
    commit_display = html.escape(header, quote=True) + diff_html
    commit_display = links.sub(r'<a href="\1">\1</a>', commit_display)
    commit_displays[commit_hash] = commit_display
  # A JSON object is a valid JavaScript object literal. Serializing with
  # json.dumps() escapes backslashes, quotes, newlines and control
  # characters, which would otherwise corrupt or break the script.
  commit_data = json.dumps(commit_displays)

  # Do not number the empty "line" after a trailing newline.
  line_nums = range(1, row if lastline.strip() == '' else row + 1)
  line_nums = '\n'.join([str(num) for num in line_nums])
  lines = ''.join(lines)
  return page % (commit_data, line_nums, lines)
| |
| |
def show_visualization(page):
  """Displays |page| in a web browser.

  The page is written to a temporary HTML file, which is then opened with
  the default web browser.

  Args:
    page: The contents of the HTML file to display, as a string.
  """
  # Keep the temporary file around so the browser has time to open it.
  # TODO(thomasanderson): spin up a temporary web server to serve this
  # file so we don't have to leak it.
  with tempfile.NamedTemporaryFile(delete=False, suffix='.html') as html_file:
    html_file.write(page.encode())
  url = 'file://' + html_file.name

  if not sys.platform.startswith('linux'):
    webbrowser.open(url)
    return

  # Don't show any messages when starting the browser: point stdout and
  # stderr at the null device while the browser is launched. os.dup2() is
  # used because the descriptors it creates are inheritable by default,
  # whereas descriptors returned by os.open() are not and would be closed
  # in the browser process.
  devnull = os.open(os.devnull, os.O_RDWR)
  saved_stdout = os.dup(1)
  saved_stderr = os.dup(2)
  try:
    os.dup2(devnull, 1)
    os.dup2(devnull, 2)
    webbrowser.open(url)
  finally:
    # Restore the original stdout and stderr even if the launch failed.
    os.dup2(saved_stdout, 1)
    os.dup2(saved_stderr, 2)
    os.close(saved_stdout)
    os.close(saved_stderr)
    os.close(devnull)
| |
| |
def main(argv):
  """Runs the uberblame tool.

  Args:
    argv: The command line arguments, without the program name.

  Returns:
    The exit code of the program, which is 0.
  """
  parser = argparse.ArgumentParser(
      description='Show what revision last modified each token of a file.')
  parser.add_argument(
      'revision',
      nargs='?',
      default='HEAD',
      help='show only commits starting from a revision')
  parser.add_argument('file', help='the file to uberblame')
  # All remaining options are plain on/off switches.
  for flag, description in (
      ('--skip-visualization',
       'do not display the blame visualization in a web browser'),
      ('--tokenize-by-char', 'treat individual characters as tokens'),
      ('--tokenize-whitespace',
       'also blame non-newline whitespace characters'),
  ):
    parser.add_argument(flag, action='store_true', help=description)
  args = parser.parse_args(argv)

  # Bind the tokenization options chosen on the command line.
  tokenization_method = lambda text: tokenize_data(
      text, args.tokenize_by_char, args.tokenize_whitespace)

  data, blame = uberblame(args.file, args.revision, tokenization_method)
  page = create_visualization(data, blame)
  if args.skip_visualization:
    return 0
  show_visualization(page)
  return 0
| |
| |
# Script entry point: run main() with the command line arguments (without
# the program name) and use its return value as the process exit code.
if __name__ == '__main__':
  sys.exit(main(sys.argv[1:]))