blob: b6f1132c16d4c05f054bd84a9b0ca4feda0e7b57 [file] [log] [blame]
#!/usr/bin/env python3
# Copyright 2017 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import argparse
import colorsys
import difflib
import html
import random
import os
import re
import subprocess
import sys
import tempfile
import textwrap
import webbrowser
class TokenContext:
  """Location and blame information for a single token.

  Attributes:
    row: Row index of the token in the data file.
    column: Column index of the token in the data file.
    token: The token string.
    commit: The Commit object for the commit that added this token.
        Defaults to None when no commit is supplied.
  """

  def __init__(self, row, column, token, commit=None):
    # Position of the token within the data file.
    self.row, self.column = row, column
    # The token text and the commit it is attributed to.
    self.token, self.commit = token, commit
class Commit:
  """A single commit as parsed from the git log output.

  Attributes:
    hash: The commit hash.
    author_name: The author's name.
    author_email: The author's email.
    author_date: The date and time the author created this commit.
    message: The commit message.
    diff: The commit diff.
  """

  def __init__(self, hash, author_name, author_email, author_date, message,
               diff):
    # Identity of the commit.
    self.hash = hash
    # Authorship metadata.
    self.author_name, self.author_email = author_name, author_email
    self.author_date = author_date
    # Payload: the description and the patch.
    self.message, self.diff = message, diff
def tokenize_data(data, tokenize_by_char, tokenize_whitespace):
  """Tokenizes |data|.

  Args:
    data: String to tokenize.
    tokenize_by_char: If true, individual characters are treated as tokens.
        Otherwise, tokens are either symbols or strings of both alphanumeric
        characters and underscores.
    tokenize_whitespace: Treat non-newline whitespace characters as tokens.

  Returns:
    A list of lists of TokenContexts. Each list represents a line. The result
    always has one more list than |data| has newlines, so data ending in a
    newline yields a trailing empty list.
  """
  contexts = []
  line_contexts = []
  in_identifier = False
  identifier_start = 0
  identifier = ''
  row = 0
  column = 0

  for c in data:
    if not tokenize_by_char and (c.isalnum() or c == '_'):
      if in_identifier:
        identifier += c
      else:
        in_identifier = True
        identifier_start = column
        identifier = c
    else:
      # Any non-identifier character terminates the identifier in progress.
      if in_identifier:
        line_contexts.append(TokenContext(row, identifier_start, identifier))
        in_identifier = False
      # Newlines are never tokens; other whitespace only on request.
      if not c.isspace() or (tokenize_whitespace and c != '\n'):
        line_contexts.append(TokenContext(row, column, c))

    if c == '\n':
      row += 1
      column = 0
      contexts.append(line_contexts)
      line_contexts = []
    else:
      column += 1

  # Flush an identifier that runs up to the very end of |data|. Without this
  # the last token of any input lacking a trailing newline (for example a
  # single line taken out of a diff) would be silently dropped.
  if in_identifier:
    line_contexts.append(TokenContext(row, identifier_start, identifier))
  contexts.append(line_contexts)
  return contexts
def compute_unified_diff(old_tokens, new_tokens):
  """Diffs two token sequences.

  Args:
    old_tokens: Token strings corresponding to the old data.
    new_tokens: Token strings corresponding to the new data.

  Returns:
    The lines of the diff between |old_tokens| and |new_tokens|, in unified
    diff format.
  """
  # No context lines are requested (n=0) and no line terminator is appended
  # to the control lines (lineterm='').
  diff_lines = difflib.unified_diff(
      old_tokens, new_tokens, lineterm='', n=0)
  return diff_lines
def parse_chunk_header_file_range(file_range):
  """Converts a chunk header file range into a half-open index range.

  Diff chunk headers have the form:
    @@ -<file-range> +<file-range> @@
  File ranges have the form:
    <start line number>,<number of lines changed>
  or just <start line number>, which is handled as a single line.

  Args:
    file_range: A chunk header file range.

  Returns:
    A tuple (range_start, range_end). The endpoints are adjusted such that
    iterating over [range_start, range_end) will give the changed indices.
  """
  if ',' not in file_range:
    # A bare line number: exactly one line, converted to a zero-based index.
    return (int(file_range) - 1, int(file_range))

  fields = file_range.split(',')
  start, amount = int(fields[0]), int(fields[1])
  if amount == 0:
    # Empty range: returned as (start, start), without the -1 shift.
    return (start, start)
  first_index = start - 1
  return (first_index, first_index + amount)
def compute_changed_token_indices(previous_tokens, current_tokens):
  """Classifies the tokens of |current_tokens| relative to |previous_tokens|.

  Args:
    previous_tokens: Tokens corresponding to the old file.
    current_tokens: Tokens corresponding to the new file.

  Returns:
    A tuple (added_tokens, changed_tokens).
    added_tokens: A list of indices into |current_tokens|.
    changed_tokens: A map of indices into |current_tokens| to
        indices into |previous_tokens|.
  """
  added_tokens = []
  changed_tokens = {}
  # Index just past the most recent chunk, on the current and previous side.
  current_cursor = 0
  previous_cursor = 0

  for diff_line in compute_unified_diff(previous_tokens, current_tokens):
    if not diff_line.startswith("@@"):
      continue
    header_fields = diff_line.split(' ')
    removed_start, removed_end = parse_chunk_header_file_range(
        header_fields[1].lstrip('-'))
    added_start, added_end = parse_chunk_header_file_range(
        header_fields[2].lstrip('+'))

    added_tokens.extend(range(added_start, added_end))
    # Tokens between the previous chunk and this one exist on both sides, so
    # pair them up one-to-one.
    gap = removed_start - previous_cursor
    changed_tokens.update(
        (current_cursor + step, previous_cursor + step)
        for step in range(gap))
    previous_cursor = removed_end
    current_cursor = added_end

  # Pair up whatever follows the final chunk.
  tail = len(previous_tokens) - previous_cursor
  changed_tokens.update(
      (current_cursor + step, previous_cursor + step) for step in range(tail))
  return added_tokens, changed_tokens
def flatten_nested_list(l):
  """Flattens a list of lists, remembering where each element came from.

  Args:
    l: A list of lists.

  Returns:
    A tuple (flattened, index_to_position):
    flattened: The flattened list.
    index_to_position: A dict mapping each index i of |flattened| to a pair
        (r, c) such that flattened[i] == l[r][c].
  """
  flattened = []
  index_to_position = {}
  for r, nested_list in enumerate(l):
    for c, element in enumerate(nested_list):
      # The next free slot in |flattened| is this element's flat index.
      index_to_position[len(flattened)] = (r, c)
      flattened.append(element)
  return (flattened, index_to_position)
def compute_changed_token_positions(previous_tokens, current_tokens):
  """Computes changed and added tokens as (row, column) positions.

  Args:
    previous_tokens: A list of lists of token strings. Lines in the file
        correspond to the nested lists.
    current_tokens: A list of lists of token strings. Lines in the file
        correspond to the nested lists.

  Returns:
    A tuple (added_token_positions, changed_token_positions):
    added_token_positions: A list of pairs that index into |current_tokens|.
    changed_token_positions: A map from pairs that index into
        |current_tokens| to pairs that index into |previous_tokens|.
  """
  # The diff is computed on flat sequences; keep the lookup tables needed to
  # translate flat indices back into (row, column) pairs afterwards.
  previous_flat, previous_lookup = flatten_nested_list(previous_tokens)
  current_flat, current_lookup = flatten_nested_list(current_tokens)

  added_indices, changed_indices = compute_changed_token_indices(
      previous_flat, current_flat)

  added_token_positions = [current_lookup[index] for index in added_indices]
  changed_token_positions = {}
  for current_index, previous_index in changed_indices.items():
    changed_token_positions[current_lookup[current_index]] = (
        previous_lookup[previous_index])
  return (added_token_positions, changed_token_positions)
def parse_chunks_from_diff(diff):
  """Returns a generator of chunk data from a diff.

  Lines outside of chunks (file headers, rename metadata, a trailing
  "\\ No newline at end of file" marker, ...) are skipped. A diff without any
  chunk therefore yields nothing instead of failing.

  Args:
    diff: A list of strings, with each string being a line from a diff
        in unified diff format.

  Yields:
    Tuples (added_lines_start, added_lines_end, removed_lines), one per run of
    consecutive '-'/'+' lines inside a chunk.
    added_lines_start, added_lines_end: Half-open range of the added lines,
        counted from the chunk header's adjusted start index for the new
        file.
    removed_lines: The removed lines of the run, without their leading '-'.

  Raises:
    ValueError: The diff ends before a chunk has as many lines as its header
        announces.
  """
  it = iter(diff)
  for line in it:
    # Only chunk headers start a chunk; everything else at this level is
    # ignored. Calling next() here instead would leak StopIteration out of
    # the generator, which Python reports as a RuntimeError.
    if not line.startswith('@@'):
      continue

    parts = line.split(' ')
    previous_start, previous_end = parse_chunk_header_file_range(
        parts[1].lstrip('-'))
    current_start, current_end = parse_chunk_header_file_range(
        parts[2].lstrip('+'))

    in_delta = False
    added_lines_start = None
    added_lines_end = None
    removed_lines = []
    # Consume lines until both sides of the chunk are accounted for.
    while previous_start < previous_end or current_start < current_end:
      line = next(it, None)
      if line is None:
        raise ValueError('diff ended in the middle of a chunk')
      firstchar = line[0]
      line = line[1:]

      if not in_delta and (firstchar == '-' or firstchar == '+'):
        # First changed line after context: start a new delta.
        in_delta = True
        added_lines_start = current_start
        added_lines_end = current_start
        removed_lines = []

      if firstchar == '-':
        removed_lines.append(line)
        previous_start += 1
      elif firstchar == '+':
        current_start += 1
        added_lines_end = current_start
      elif firstchar == ' ':
        # A context line closes the delta in progress, if any.
        if in_delta:
          in_delta = False
          yield (added_lines_start, added_lines_end, removed_lines)
        previous_start += 1
        current_start += 1
      # Any other prefix (e.g. '\\') advances neither side and is ignored.

    # A chunk may end while a delta is still open.
    if in_delta:
      yield (added_lines_start, added_lines_end, removed_lines)
def should_skip_commit(commit):
  """Decides if |commit| should be skipped when computing the blame.

  Commit 5d4451e deleted all files in the repo except for DEPS. The
  next commit, 1e7896, brought them back. This is a hack to skip
  those commits (except for the files they modified). If we did not
  do this, changes would be incorrectly attributed to 1e7896.

  Args:
    commit: A Commit object.

  Returns:
    A boolean indicating if this commit should be skipped.
  """
  banned_commits = (
      '1e78967ed2f1937b3809c19d91e7dd62d756d307',
      '5d4451ebf298d9d71f716cc0135f465cec41fcd0',
  )
  banned_commits_file_exceptions = (
      'DEPS',
      'chrome/browser/ui/views/file_manager_dialog_browsertest.cc',
  )

  is_banned = commit.hash in banned_commits
  if not is_banned:
    return False

  for diff_line in commit.diff:
    if diff_line.startswith(('---', '+++')):
      # File header line: the path is the second space-separated field.
      path = diff_line.split(' ')[1]
      if path in banned_commits_file_exceptions:
        return False
    elif diff_line.startswith('@@'):
      # Reached the first chunk without seeing an excepted file.
      return True
  assert False
def generate_substrings(file):
  """Generates substrings from a file stream, where substrings are
  separated by '\\0'.

  Empty substrings are never produced. For example, the input:
    'a\\0bc\\0\\0\\0d\\0'
  would produce the output:
    ['a', 'bc', 'd']

  Args:
    file: A readable binary file. It is read until read() returns an empty
        bytes object, so streams that hand back fewer bytes than requested
        before reaching the end are still consumed completely.

  Yields:
    The non-empty substrings, decoded to str.
  """
  BUF_SIZE = 448  # Experimentally found to be pretty fast.
  # Pieces of the substring currently being assembled; it may span any
  # number of reads.
  pending = []
  # Only an empty read marks the end of the stream. A short read does not.
  for buf in iter(lambda: file.read(BUF_SIZE), b''):
    head, *rest = buf.split(b'\0')
    pending.append(head)
    if not rest:
      # No separator in this buffer; keep accumulating.
      continue
    # The first separator completes the pending substring; every piece
    # between two separators of this buffer is complete as well. Bytes are
    # joined before decoding so that a multi-byte character split across two
    # reads decodes correctly.
    for part in [b''.join(pending)] + rest[:-1]:
      if part:
        yield part.decode()
    # Whatever follows the last separator starts the next substring.
    pending = [rest[-1]]

  tail = b''.join(pending)
  if tail:
    yield tail.decode()
def generate_commits(git_log_stdout):
  """Parses git log output into a stream of Commit objects.

  Each commit is read as six consecutive '\\0'-separated fields: hash, author
  name, author email, author date, message and diff. Parsing stops quietly as
  soon as the fields run out, even in the middle of a record.

  Args:
    git_log_stdout: A readable file holding the git log output.

  Yields:
    One Commit per complete record.
  """
  fields = generate_substrings(git_log_stdout)
  for commit_hash in fields:
    try:
      author_name = next(fields)
      author_email = next(fields)
      author_date = next(fields)
      message = next(fields).rstrip('\n')
      # The diff field is split into lines; its first and last pieces are
      # dropped.
      diff = next(fields).split('\n')[1:-1]
    except StopIteration:
      return
    yield Commit(commit_hash, author_name, author_email, author_date, message,
                 diff)
def uberblame_aux(file_name, git_log_stdout, data, tokenization_method):
  """Computes the uberblame of file |file_name|.

  Tokenizes |data| and then processes the commits parsed from
  |git_log_stdout| one at a time. For every chunk of a commit's diff, the
  tokens that the token-level diff reports as added are attributed to that
  commit, and the affected lines of the working blame are replaced by the
  chunk's removed lines so the next commit sees the file as it was before.

  Args:
    file_name: File to uberblame. Not used by this function.
    git_log_stdout: A file object that represents the git log output.
    data: A string containing the data of file |file_name|.
    tokenization_method: A function that takes a string and returns a list of
        lists of TokenContexts, one inner list per line (the result is sliced
        by line and indexed by [row][column] below).

  Returns:
    A tuple (data, blame).
    data: File contents, i.e. |data| itself.
    blame: The tokenization of |data|, with the commit attribute of the
        TokenContexts filled in.
  """
  # |blame| is the working copy that gets rewound commit by commit.
  blame = tokenization_method(data)
  # Running count of attributed tokens; not read anywhere in this function.
  blamed_tokens = 0
  # The result holds a shallow copy: rewinding |blame| below does not alter
  # its list of lines, but the TokenContext objects are shared, so commits
  # assigned through |blame| are visible in the result.
  uber_blame = (data, blame[:])

  for commit in generate_commits(git_log_stdout):
    if should_skip_commit(commit):
      continue

    # Chunk positions refer to the file after the commit. Once an earlier
    # chunk of this commit has been rewound, later chunks are shifted by the
    # resulting change in line count.
    offset = 0
    for (added_lines_start, added_lines_end,
         removed_lines) in parse_chunks_from_diff(commit.diff):
      added_lines_start += offset
      added_lines_end += offset
      # Each removed line is tokenized on its own; the resulting token lines
      # are concatenated.
      previous_contexts = [
          token_lines
          for line_previous in removed_lines
          for token_lines in tokenization_method(line_previous)
      ]
      previous_tokens = [[context.token for context in contexts]
                         for contexts in previous_contexts]
      # Slicing copies the outer list only; the inner lists and their
      # TokenContexts are the ones stored in |blame|.
      current_contexts = blame[added_lines_start:added_lines_end]
      current_tokens = [[context.token for context in contexts]
                        for contexts in current_contexts]
      added_token_positions, changed_token_positions = (
          compute_changed_token_positions(previous_tokens, current_tokens))
      # Tokens reported as added are attributed to this commit.
      for r, c in added_token_positions:
        current_contexts[r][c].commit = commit
        blamed_tokens += 1
      # Tokens with a counterpart on the previous side carry their existing
      # TokenContext object over, so an older commit can still claim them.
      for r, c in changed_token_positions:
        pr, pc = changed_token_positions[(r, c)]
        previous_contexts[pr][pc] = current_contexts[r][c]

      assert added_lines_start <= added_lines_end <= len(blame)
      current_blame_size = len(blame)
      # Rewind: swap the lines added by this chunk for the lines it removed.
      blame[added_lines_start:added_lines_end] = previous_contexts
      offset += len(blame) - current_blame_size

  # After every commit has been rewound, no tokens may be left over.
  assert blame == [] or blame == [[]]
  return uber_blame
def uberblame(file_name, revision, tokenization_method):
  """Computes the uberblame of file |file_name|.

  Runs `git log` for the file's history and `git show` for its contents at
  |revision|, then hands both to uberblame_aux().

  Args:
    file_name: File to uberblame.
    revision: The revision to start the uberblame at.
    tokenization_method: A function that takes a string and returns a list of
        TokenContexts.

  Returns:
    A tuple (data, blame).
    data: File contents.
    blame: A list of TokenContexts.

  Raises:
    subprocess.CalledProcessError: `git show` or `git log` exited with a
        non-zero status. For `git log` the captured stderr text is attached
        as the exception's output.
  """
  DIFF_CONTEXT = 3
  cmd_git_log = [
      'git', 'log', '--minimal', '--no-prefix', '--follow', '-m',
      '--first-parent', '-p',
      '-U%d' % DIFF_CONTEXT, '-z', '--format=%x00%H%x00%an%x00%ae%x00%ad%x00%B',
      revision, '--', file_name
  ]
  # stderr goes to a temporary file instead of a pipe: nothing reads it while
  # stdout is being consumed, and a full pipe would block git forever.
  with tempfile.TemporaryFile() as git_log_stderr:
    # Leaving the Popen context closes the stdout pipe and waits for git, so
    # the child is reaped even when `git show` or the blame itself raises.
    with subprocess.Popen(
        cmd_git_log, stdout=subprocess.PIPE,
        stderr=git_log_stderr) as git_log:
      data = subprocess.check_output(
          ['git', 'show', '%s:%s' % (revision, file_name)]).decode()
      data, blame = uberblame_aux(file_name, git_log.stdout, data,
                                  tokenization_method)

    if git_log.returncode != 0:
      git_log_stderr.seek(0)
      stderr = git_log_stderr.read().decode()
      raise subprocess.CalledProcessError(git_log.returncode, cmd_git_log,
                                          stderr)
  return data, blame
def generate_pastel_color():
  """Picks a random color from a nice looking pastel palette.

  Returns:
    The color, formatted as hex string. For example, white is "#FFFFFF".
  """
  # Any hue, high lightness, and medium to full saturation. The three random
  # draws happen in this order.
  hue = random.uniform(0, 1)
  lightness = random.uniform(0.8, 0.9)
  saturation = random.uniform(0.5, 1)
  channels = colorsys.hls_to_rgb(hue, lightness, saturation)
  return "#%0.2X%0.2X%0.2X" % tuple(int(value * 255) for value in channels)
def colorize_diff(diff):
  """Colorizes a diff for use in an HTML page.

  Every line is HTML-escaped (carriage returns are removed first) and wrapped
  in a <span> whose class depends on the kind of line. The double quotes of
  the generated markup are written as \\" so the result can be placed inside
  a double-quoted JavaScript string.

  Args:
    diff: The diff, in unified diff format, as a list of line strings.

  Returns:
    The HTML-formatted diff, as a string. The diff will already be escaped.
  """

  def span(css_class, text):
    # Single place that builds the markup, so every tag is opened and closed
    # the same way.
    return '<span class=\\"%s\\">%s</span>' % (css_class, text)

  colorized = []
  for line in diff:
    escaped = html.escape(line.replace('\r', ''), quote=True)
    if line.startswith('+'):
      colorized.append(span('addition', escaped))
    elif line.startswith('-'):
      colorized.append(span('deletion', escaped))
    elif line.startswith('@@'):
      # Chunk header: "@@ <ranges> @@<context>". Split after the second "@@"
      # so the ranges and the trailing context can be styled separately.
      context_begin = escaped.find('@@', 2)
      assert context_begin != -1
      colorized.append(
          span('chunk_meta', escaped[0:context_begin + 2]) +
          span('chunk_context', escaped[context_begin + 2:]))
    elif line.startswith('diff') or line.startswith('index'):
      colorized.append(span('file_header', escaped))
    else:
      colorized.append(span('context_line', escaped))
  return '\n'.join(colorized)
def create_visualization(data, blame):
  """Creates a web page to visualize |blame|.

  The page shows the file on the left, with each run of consecutive tokens
  from the same commit wrapped in a colored, clickable <span>. Clicking a
  token shows the commit's metadata and diff on the right. The commit
  descriptions are embedded in the page as a JavaScript object literal keyed
  by commit hash.

  Args:
    data: The data file as returned by uberblame().
    blame: A list of lists of TokenContexts as returned by uberblame(). Every
        TokenContext must have its commit set.

  Returns:
    The HTML for the generated page, as a string.
  """
  # Use the same seed for the color generator on each run so that
  # loading the same blame of the same file twice will result in the
  # same generated HTML page.
  random.seed(0x52937865ec62d1ea)
  page = """\
    <html>
    <head>
    <style>
    body {
    font-family: monospace;
    }
    pre {
    display: inline;
    }
    .token {
    outline: 1pt solid #00000030;
    outline-offset: -1pt;
    cursor: pointer;
    }
    .addition {
    color: #080;
    }
    .deletion {
    color: #c00;
    }
    .chunk_meta {
    color: #099;
    }
    .context_line .chunk_context {
    // Just normal text.
    }
    .file_header {
    font-weight: bold;
    }
    #linenums {
    text-align: right;
    }
    #file_display {
    position: absolute;
    left: 0;
    top: 0;
    width: 50%%;
    height: 100%%;
    overflow: scroll;
    }
    #commit_display_container {
    position: absolute;
    left: 50%%;
    top: 0;
    width: 50%%;
    height: 100%%;
    overflow: scroll;
    }
    </style>
    <script>
    commit_data = %s;
    function display_commit(hash) {
    var e = document.getElementById("commit_display");
    e.innerHTML = commit_data[hash]
    }
    </script>
    </head>
    <body>
    <div id="file_display">
    <table>
    <tbody>
    <tr>
    <td valign="top" id="linenums">
    <pre>%s</pre>
    </td>
    <td valign="top">
    <pre>%s</pre>
    </td>
    </tr>
    </tbody>
    </table>
    </div>
    <div id="commit_display_container" valign="top">
    <pre id="commit_display" />
    </div>
    </body>
    </html>
    """
  page = textwrap.dedent(page)

  commits = {}
  lines = []
  commit_colors = {}
  blame_index = 0
  # Flatten the per-line lists; tokens are located by their row/column below.
  blame = [context for contexts in blame for context in contexts]
  row = 0
  lastline = ''
  for line in data.split('\n'):
    lastline = line
    column = 0
    for c in line + '\n':
      # Past the end of the current token? Close its span if the next token
      # belongs to a different commit (or there is no next token), then move
      # on to the next token.
      if blame_index < len(blame):
        token_context = blame[blame_index]
        if (row == token_context.row and
            column == token_context.column + len(token_context.token)):
          if (blame_index + 1 == len(blame) or blame[blame_index].commit.hash !=
              blame[blame_index + 1].commit.hash):
            lines.append('</span>')
          blame_index += 1
      # At the start of a token? Open a span unless the previous token's
      # span, from the same commit, is still open.
      if blame_index < len(blame):
        token_context = blame[blame_index]
        if row == token_context.row and column == token_context.column:
          if (blame_index == 0 or blame[blame_index - 1].commit.hash !=
              blame[blame_index].commit.hash):
            commit_hash = token_context.commit.hash
            commits[commit_hash] = token_context.commit
            if commit_hash not in commit_colors:
              commit_colors[commit_hash] = generate_pastel_color()
            color = commit_colors[commit_hash]
            lines.append(('<span class="token" style="background-color: %s" ' +
                          'onclick="display_commit(&quot;%s&quot;)">') %
                         (color, commit_hash))
      lines.append(html.escape(c))
      column += 1
    row += 1

  commit_data = ['{\n']
  commit_display_format = """\
    commit: {hash}
    Author: {author_name} <{author_email}>
    Date: {author_date}
    {message}
    """
  commit_display_format = textwrap.dedent(commit_display_format)
  # '<' is excluded from URLs: in the HTML-escaped text it can only be the
  # start of generated markup (such as the </span> closing a diff line),
  # which must not end up inside the link target.
  links = re.compile(r'(https?:\/\/[^\s<]+)')
  for commit_hash in commits:
    commit = commits[commit_hash]
    header = commit_display_format.format(
        hash=commit_hash,
        author_name=commit.author_name,
        author_email=commit.author_email,
        author_date=commit.author_date,
        message=commit.message)
    header = html.escape(header, quote=True)
    # The text ends up inside a double-quoted JavaScript string. Double
    # quotes are already gone (HTML-escaped). Backslashes in the commit text
    # must be doubled so they are shown literally instead of being read as
    # (possibly invalid) escape sequences, and carriage returns must be
    # removed because a raw line terminator ends a string literal. This is
    # done before any markup is added, as the markup contains intentional \"
    # sequences.
    header = header.replace('\r', '').replace('\\', '\\\\')
    diff_lines = [
        diff_line.replace('\\', '\\\\') for diff_line in commit.diff
    ]
    commit_display = header + colorize_diff(diff_lines)
    commit_display = re.sub(links, '<a href=\\"\\1\\">\\1</a>', commit_display)
    commit_display = commit_display.replace('\n', '\\n')
    commit_data.append('"%s": "%s",\n' % (commit_hash, commit_display))
  commit_data.append('}')
  commit_data = ''.join(commit_data)

  # A blank final line (e.g. the empty piece after a trailing newline) gets
  # no line number.
  line_nums = range(1, row if lastline.strip() == '' else row + 1)
  line_nums = '\n'.join([str(num) for num in line_nums])
  lines = ''.join(lines)
  return page % (commit_data, line_nums, lines)
def show_visualization(page):
  """Display |page| in a web browser.

  The page is written to a temporary .html file, which is then opened with
  the webbrowser module. On Linux, stdout and stderr are pointed at the null
  device while the browser is started.

  Args:
    page: The contents of the file to display, as a string.
  """
  # Keep the temporary file around so the browser has time to open it.
  # TODO(thomasanderson): spin up a temporary web server to serve this
  # file so we don't have to leak it.
  # Only the file on disk is kept (delete=False); the handle is closed as
  # soon as the contents are written.
  with tempfile.NamedTemporaryFile(delete=False, suffix='.html') as html_file:
    html_file.write(page.encode())
  url = 'file://' + html_file.name

  silence_output = sys.platform.startswith('linux')
  if silence_output:
    # Don't show any messages when starting the browser.
    saved_stdout = os.dup(1)
    saved_stderr = os.dup(2)
    # dup2 targets descriptors 1 and 2 explicitly rather than relying on
    # os.open() handing out the lowest free descriptor numbers.
    devnull = os.open(os.devnull, os.O_RDWR)
    os.dup2(devnull, 1)
    os.dup2(devnull, 2)
    os.close(devnull)
  try:
    webbrowser.open(url)
  finally:
    # Restore the real stdout/stderr even if starting the browser failed.
    if silence_output:
      os.dup2(saved_stdout, 1)
      os.dup2(saved_stderr, 2)
      os.close(saved_stdout)
      os.close(saved_stderr)
def main(argv):
  """Command line entry point.

  Parses |argv|, computes the uberblame of the requested file, builds the
  visualization page and, unless --skip-visualization was given, opens it in
  a web browser.

  Args:
    argv: The command line arguments, without the program name.

  Returns:
    0, for use as the process exit code.
  """
  parser = argparse.ArgumentParser(
      description='Show what revision last modified each token of a file.')
  # Positional arguments. The revision is optional and comes first.
  parser.add_argument(
      'revision',
      default='HEAD',
      nargs='?',
      help='show only commits starting from a revision')
  parser.add_argument('file', help='the file to uberblame')
  # Boolean switches.
  parser.add_argument(
      '--skip-visualization',
      action='store_true',
      help='do not display the blame visualization in a web browser')
  parser.add_argument(
      '--tokenize-by-char',
      action='store_true',
      help='treat individual characters as tokens')
  parser.add_argument(
      '--tokenize-whitespace',
      action='store_true',
      help='also blame non-newline whitespace characters')
  options = parser.parse_args(argv)

  def tokenize(text):
    # Binds the tokenization switches chosen on the command line.
    return tokenize_data(text, options.tokenize_by_char,
                         options.tokenize_whitespace)

  contents, blame = uberblame(options.file, options.revision, tokenize)
  page = create_visualization(contents, blame)
  if options.skip_visualization:
    return 0
  show_visualization(page)
  return 0
# Script entry point: run main() on the command line arguments (without the
# program name) and use its return value as the exit status.
if __name__ == '__main__':
  exit_status = main(sys.argv[1:])
  sys.exit(exit_status)