author     Allan Sandfeld Jensen <allan.jensen@qt.io>   2017-11-20 15:06:40 +0100
committer  Allan Sandfeld Jensen <allan.jensen@qt.io>   2017-11-22 11:48:58 +0000
commit     daa093eea7c773db06799a13bd7e4e2e2a9f8f14 (patch)
tree       96cc5e7b9194c1b29eab927730bfa419e7111c25 /chromium/tools/uberblame.py
parent     be59a35641616a4cf23c4a13fa0632624b021c1b (diff)
download   qtwebengine-chromium-daa093eea7c773db06799a13bd7e4e2e2a9f8f14.tar.gz
BASELINE: Update Chromium to 63.0.3239.58
Change-Id: Ia93b322a00ba4dd4004f3bcf1254063ba90e1605
Reviewed-by: Alexandru Croitor <alexandru.croitor@qt.io>
Diffstat (limited to 'chromium/tools/uberblame.py')
-rwxr-xr-x  chromium/tools/uberblame.py  556
1 file changed, 556 insertions, 0 deletions
diff --git a/chromium/tools/uberblame.py b/chromium/tools/uberblame.py
new file mode 100755
index 00000000000..282bb1f5d46
--- /dev/null
+++ b/chromium/tools/uberblame.py
@@ -0,0 +1,556 @@
+#!/usr/bin/env python
+# Copyright 2017 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import argparse
+import cgi
+import colorsys
+import difflib
+import random
+import os
+import re
+import subprocess
+import sys
+import tempfile
+import textwrap
+import webbrowser
+
+
+class TokenContext(object):
+  """Metadata about a token.
+
+  Attributes:
+    row: Row index of the token in the data file.
+    column: Column index of the token in the data file.
+    token: The token string.
+    commit: Hash of the git commit that added this token.
+  """
+  def __init__(self, row, column, token, commit=None):
+    self.row = row
+    self.column = column
+    self.token = token
+    self.commit = commit
+
+
+class Commit(object):
+  """Commit data.
+
+  Attributes:
+    hash: The commit hash.
+    diff: The commit diff.
+  """
+  def __init__(self, hash, diff):
+    self.hash = hash
+    self.diff = diff
+
+
+def tokenize_data(data):
+  """Tokenizes |data|.
+
+  Args:
+    data: String to tokenize.
+
+  Returns:
+    A list of TokenContexts.
+  """
+  contexts = []
+  in_identifier = False
+  identifier_start = 0
+  identifier = ''
+  row = 0
+  column = 0
+  line_contexts = []
+
+  for c in data + '\n':
+    if c.isalnum() or c == '_':
+      if in_identifier:
+        identifier += c
+      else:
+        in_identifier = True
+        identifier_start = column
+        identifier = c
+    else:
+      if in_identifier:
+        line_contexts.append(
+            TokenContext(row, identifier_start, identifier))
+      in_identifier = False
+      if not c.isspace():
+        line_contexts.append(TokenContext(row, column, c))
+
+    if c == '\n':
+      row += 1
+      column = 0
+      contexts.append(line_contexts)
+      line_tokens = []
+      line_contexts = []
+    else:
+      column += 1
+  return contexts
+
+
+def compute_unified_diff(old_tokens, new_tokens):
+  """Computes the diff between |old_tokens| and |new_tokens|.
+
+  Args:
+    old_tokens: Token strings corresponding to the old data.
+    new_tokens: Token strings corresponding to the new data.
+
+  Returns:
+    The diff, in unified diff format.
+  """
+  return difflib.unified_diff(old_tokens, new_tokens, n=0, lineterm='')
+
+
+def parse_chunk_header_file_range(file_range):
+  """Parses a chunk header file range.
+
+  Diff chunk headers have the form:
+    @@ -<file-range> +<file-range> @@
+  File ranges have the form:
+    <start line number>,<number of lines changed>
+
+  Args:
+    file_range: A chunk header file range.
+
+  Returns:
+    A tuple (range_start, range_end). The endpoints are adjusted such that
+    iterating over [range_start, range_end) will give the changed indices.
+  """
+  if ',' in file_range:
+    file_range_parts = file_range.split(',')
+    start = int(file_range_parts[0])
+    amount = int(file_range_parts[1])
+    if amount == 0:
+      return (start, start)
+    return (start - 1, start + amount - 1)
+  else:
+    return (int(file_range) - 1, int(file_range))
+
+
+def compute_changed_token_indices(previous_tokens, current_tokens):
+  """Computes changed and added tokens.
+
+  Args:
+    previous_tokens: Tokens corresponding to the old file.
+    current_tokens: Tokens corresponding to the new file.
+
+  Returns:
+    A tuple (added_tokens, changed_tokens).
+      added_tokens: A list of indices into |current_tokens|.
+      changed_tokens: A map of indices into |current_tokens| to
+        indices into |previous_tokens|.
+ """ + prev_file_chunk_end = 0 + prev_patched_chunk_end = 0 + added_tokens = [] + changed_tokens = {} + for line in compute_unified_diff(previous_tokens, current_tokens): + if line.startswith("@@"): + parts = line.split(' ') + removed = parts[1].lstrip('-') + removed_start, removed_end = parse_chunk_header_file_range(removed) + added = parts[2].lstrip('+') + added_start, added_end = parse_chunk_header_file_range(added) + for i in range(added_start, added_end): + added_tokens.append(i) + for i in range(0, removed_start - prev_patched_chunk_end): + changed_tokens[prev_file_chunk_end + i] = prev_patched_chunk_end + i + prev_patched_chunk_end = removed_end + prev_file_chunk_end = added_end + for i in range(0, len(previous_tokens) - prev_patched_chunk_end): + changed_tokens[prev_file_chunk_end + i] = prev_patched_chunk_end + i + return added_tokens, changed_tokens + + +def flatten_nested_list(l): + """Flattens a list and provides a mapping from elements in the list back + into the nested list. + + Args: + l: A list of lists. + + Returns: + A tuple (flattened, index_to_position): + flattened: The flattened list. + index_to_position: A list of pairs (r, c) such that + index_to_position[i] == (r, c); flattened[i] == l[r][c] + """ + flattened = [] + index_to_position = {} + r = 0 + c = 0 + for nested_list in l: + for element in nested_list: + index_to_position[len(flattened)] = (r, c) + flattened.append(element) + c += 1 + r += 1 + c = 0 + return (flattened, index_to_position) + + +def compute_changed_token_positions(previous_tokens, current_tokens): + """Computes changed and added token positions. + + Args: + previous_tokens: A list of lists of token strings. Lines in the file + correspond to the nested lists. + current_tokens: A list of lists of token strings. Lines in the file + correspond to the nested lists. + + Returns: + A tuple (added_token_positions, changed_token_positions): + added_token_positions: A list of pairs that index into |current_tokens|. + changed_token_positions: A map from pairs that index into + |current_tokens| to pairs that index into |previous_tokens|. + """ + flat_previous_tokens, previous_index_to_position = flatten_nested_list( + previous_tokens) + flat_current_tokens, current_index_to_position = flatten_nested_list( + current_tokens) + added_indices, changed_indices = compute_changed_token_indices( + flat_previous_tokens, flat_current_tokens) + added_token_positions = [current_index_to_position[i] for i in added_indices] + changed_token_positions = { + current_index_to_position[current_i]: + previous_index_to_position[changed_indices[current_i]] + for current_i in changed_indices + } + return (added_token_positions, changed_token_positions) + + +def parse_chunks_from_diff(diff): + """Returns a generator of chunk data from a diff. + + Args: + diff: A list of strings, with each string being a line from a diff + in unified diff format. 
+
+  Returns:
+    A generator of tuples (added_lines_start, added_lines_end,
+      removed_lines, removed_lines_start)
+  """
+  in_chunk = False
+  chunk_previous = []
+  previous_start = None
+  current_start = None
+  current_end = None
+  for line in diff:
+    if line.startswith('@@'):
+      if in_chunk:
+        yield (current_start, current_end,
+               chunk_previous, previous_start)
+      parts = line.split(' ')
+      previous = parts[1].lstrip('-')
+      previous_start, _ = parse_chunk_header_file_range(previous)
+      current = parts[2].lstrip('+')
+      current_start, current_end = parse_chunk_header_file_range(current)
+      in_chunk = True
+      chunk_previous = []
+    elif in_chunk and line.startswith('-'):
+      chunk_previous.append(line[1:])
+  if current_start != None:
+    yield (current_start, current_end,
+           chunk_previous, previous_start)
+
+
+def should_skip_commit(commit):
+  """Decides if |commit| should be skipped when computing the blame.
+
+  Commit 5d4451e deleted all files in the repo except for DEPS. The
+  next commit, 1e7896, brought them back. This is a hack to skip
+  those commits (except for the files they modified). If we did not
+  do this, changes would be incorrectly attributed to 1e7896.
+
+  Args:
+    commit: A Commit object.
+
+  Returns:
+    A boolean indicating if this commit should be skipped.
+  """
+  banned_commits = [
+      '1e78967ed2f1937b3809c19d91e7dd62d756d307',
+      '5d4451ebf298d9d71f716cc0135f465cec41fcd0',
+  ]
+  if commit.hash not in banned_commits:
+    return False
+  banned_commits_file_exceptions = [
+      'DEPS',
+      'chrome/browser/ui/views/file_manager_dialog_browsertest.cc',
+  ]
+  for line in commit.diff:
+    if line.startswith('---') or line.startswith('+++'):
+      if line.split(' ')[1] in banned_commits_file_exceptions:
+        return False
+    elif line.startswith('@@'):
+      return True
+  assert False
+
+
+def generate_substrings(file):
+  """Generates substrings from a file stream, where substrings are
+  separated by '\0'.
+
+  For example, the input:
+    'a\0bc\0\0\0d\0'
+  would produce the output:
+    ['a', 'bc', 'd']
+
+  Args:
+    file: A readable file.
+  """
+  data = ''
+  while True:
+    ch = file.read(1)
+    if ch == '':
+      break
+    if ch == '\0':
+      if data != '':
+        yield data
+      data = ''
+    else:
+      data += ch
+  if data != '':
+    yield data
+
+
+def generate_commits(git_log_stdout):
+  """Parses git log output into a stream of Commit objects.
+  """
+  substring_generator = generate_substrings(git_log_stdout)
+  while True:
+    hash = substring_generator.next().strip('\n')
+    diff = substring_generator.next().strip('\n').split('\n')
+    yield Commit(hash, diff)
+
+
+def uberblame_aux(file_name, git_log_stdout, data):
+  """Computes the uberblame of file |file_name|.
+
+  Args:
+    file_name: File to uberblame.
+    git_log_stdout: A file object that represents the git log output.
+    data: A string containing the data of file |file_name|.
+
+  Returns:
+    A tuple (data, blame).
+      data: File contents.
+      blame: A list of TokenContexts.
+ """ + blame = tokenize_data(data) + + blamed_tokens = 0 + total_tokens = len(blame) + uber_blame = (data, blame[:]) + + for commit in generate_commits(git_log_stdout): + if should_skip_commit(commit): + continue + + offset = 0 + for (added_lines_start, added_lines_end, removed_lines, + removed_lines_start) in parse_chunks_from_diff(commit.diff): + added_lines_start += offset + added_lines_end += offset + previous_contexts = [token_lines + for line_previous in removed_lines + for token_lines in tokenize_data(line_previous)] + previous_tokens = [ + [context.token for context in contexts] + for contexts in previous_contexts + ] + current_contexts = blame[added_lines_start:added_lines_end] + current_tokens = [ + [context.token for context in contexts] + for contexts in current_contexts + ] + added_token_positions, changed_token_positions = ( + compute_changed_token_positions(previous_tokens, current_tokens)) + for r, c in added_token_positions: + current_contexts[r][c].commit = commit.hash + blamed_tokens += 1 + for r, c in changed_token_positions: + pr, pc = changed_token_positions[(r, c)] + previous_contexts[pr][pc] = current_contexts[r][c] + + assert added_lines_start <= added_lines_end <= len(blame) + current_blame_size = len(blame) + blame[added_lines_start:added_lines_end] = previous_contexts + offset += len(blame) - current_blame_size + + assert blame == [] or blame == [[]] + return uber_blame + + +def uberblame(file_name, revision): + """Computes the uberblame of file |file_name|. + + Args: + file_name: File to uberblame. + revision: The revision to start the uberblame at. + + Returns: + A tuple (data, blame). + data: File contents. + blame: A list of TokenContexts. + """ + cmd_git_log = ['git', 'log', '--minimal', '--no-prefix', '--follow', '-m', + '--first-parent', '-p', '-U0', '-z', '--format=%x00%h', + revision, '--', file_name] + git_log = subprocess.Popen(cmd_git_log, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + data = subprocess.check_output( + ['git', 'show', '%s:%s' % (revision, file_name)]) + data, blame = uberblame_aux(file_name, git_log.stdout, data) + + _, stderr = git_log.communicate() + if git_log.returncode != 0: + raise subprocess.CalledProcessError(git_log.returncode, cmd_git_log, stderr) + return data, blame + + +def generate_pastel_color(): + (h, l, s) = (random.uniform(0, 1), + random.uniform(0.8, 0.9), + random.uniform(0.5, 1)) + (r, g, b) = colorsys.hls_to_rgb(h, l, s) + return "#%0.2X%0.2X%0.2X" % (int(r*255), int(g*255), int(b*255)) + + +def visualize_uberblame(data, blame): + """Creates and displays a web page to visualize |blame|. + + Args: + data: The data file as returned by uberblame(). + blame: A list of TokenContexts as returned by uberblame(). + """ + # Use the same seed for the color generator on each run so that + # loading the same blame of the same file twice will result in the + # same generated HTML page. 
+  random.seed(0x52937865ec62d1ea)
+  html = """\
+  <html>
+    <head>
+      <style>
+      body {
+        font-family: "Courier New";
+      }
+      pre {
+        display: inline;
+      }
+      a {
+        color: #000000;
+        text-decoration: none;
+      }
+      span {
+        outline: 1pt solid #00000030;
+        outline-offset: -1pt;
+      }
+      #linenums {
+        text-align: right;
+      }
+      </style>
+    </head>
+    <body>
+      <table>
+        <tbody>
+          <tr>
+            <td valign="top" id="linenums">
+              <pre>%s</pre>
+            </td>
+            <td valign="top">
+              <pre>%s</pre>
+            </td>
+          </tr>
+        </tbody>
+      </table>
+    </body>
+  </html>
+  """
+  html = textwrap.dedent(html)
+  lines = []
+  commit_colors = {}
+  blame_index = 0
+  blame = [context for contexts in blame for context in contexts]
+  row = 0
+  lastline = ''
+  for line in data.split('\n'):
+    lastline = line
+    column = 0
+    for c in line + '\n':
+      if blame_index < len(blame):
+        token_context = blame[blame_index]
+        if (row == token_context.row and
+            column == token_context.column + len(token_context.token)):
+          if (blame_index + 1 == len(blame) or
+              blame[blame_index].commit != blame[blame_index + 1].commit):
+            lines.append('</a></span>')
+          blame_index += 1
+      if blame_index < len(blame):
+        token_context = blame[blame_index]
+        if row == token_context.row and column == token_context.column:
+          if (blame_index == 0 or
+              blame[blame_index - 1].commit != blame[blame_index].commit):
+            commit = token_context.commit
+            assert commit != None
+            lines.append(('<a href="https://chromium.googlesource.com/' +
+                          'chromium/src/+/%s">') % commit)
+            if commit not in commit_colors:
+              commit_colors[commit] = generate_pastel_color()
+            color = commit_colors[commit]
+            lines.append('<span style="background-color: %s">' % color)
+      lines.append(cgi.escape(c))
+      column += 1
+    row += 1
+  line_nums = range(1, row if lastline.strip() == '' else row + 1)
+  line_nums = '\n'.join([str(num) for num in line_nums])
+  lines = ''.join(lines)
+  return html % (line_nums, lines)
+
+
+def show_visualization(html):
+  """Display |html| in a web browser.
+
+  Args:
+    html: The contents of the file to display, as a string.
+  """
+  # Keep the temporary file around so the browser has time to open it.
+  # TODO(thomasanderson): spin up a temporary web server to serve this
+  # file so we don't have to leak it.
+  html_file = tempfile.NamedTemporaryFile(delete=False, suffix='.html')
+  html_file.write(html)
+  html_file.flush()
+  if sys.platform.startswith('linux'):
+    # Don't show any messages when starting the browser.
+    saved_stdout = os.dup(1)
+    saved_stderr = os.dup(2)
+    os.close(1)
+    os.close(2)
+    os.open(os.devnull, os.O_RDWR)
+    os.open(os.devnull, os.O_RDWR)
+  webbrowser.open('file://' + html_file.name)
+  if sys.platform.startswith('linux'):
+    os.dup2(saved_stdout, 1)
+    os.dup2(saved_stderr, 2)
+    os.close(saved_stdout)
+    os.close(saved_stderr)
+
+
+def main():
+  parser = argparse.ArgumentParser(
+      description='Show what revision last modified each token of a file')
+  parser.add_argument('revision', default='HEAD', nargs='?',
+                      help='Show only commits starting from a revision.')
+  parser.add_argument('file', help='The file to uberblame.')
+  args = parser.parse_args()
+
+  data, blame = uberblame(args.file, args.revision)
+  html = visualize_uberblame(data, blame)
+  show_visualization(html)
+  return 0
+
+
+if __name__ == '__main__':
+  sys.exit(main())
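
A note on the tokenizer in the diff above: tokenize_data splits each line into identifier runs (alphanumerics and '_') and single punctuation characters, skipping whitespace, and records the (row, column) of every token. The sketch below is not part of the commit; it assumes uberblame.py is importable as a module, which is side-effect free since the script only calls main() under __main__:

    from uberblame import tokenize_data

    # One line of C-like code; the tokenizer is language-agnostic.
    contexts = tokenize_data('int x = 42;')
    line0 = [(ctx.token, ctx.row, ctx.column) for ctx in contexts[0]]
    # Identifiers are grouped, punctuation is one token each, whitespace
    # is skipped:
    # [('int', 0, 0), ('x', 0, 4), ('=', 0, 6), ('42', 0, 8), (';', 0, 10)]
    print(line0)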
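The chunk-header arithmetic in parse_chunk_header_file_range is easy to misread, so here is a hedged sanity check of the three cases it handles (again assuming the module is importable): a 'start,count' range, a bare line number, and a zero-length range for pure insertions.

    from uberblame import parse_chunk_header_file_range

    # '12,3': three changed lines starting at line 12; as 0-based half-open
    # indices that is [11, 14), i.e. indices 11, 12, 13.
    assert parse_chunk_header_file_range('12,3') == (11, 14)
    # Bare '7': exactly one changed line, line 7 -> index 6.
    assert parse_chunk_header_file_range('7') == (6, 7)
    # '5,0': zero lines changed (an insertion point after line 5); the
    # empty range (5, 5) makes iteration over range(start, end) a no-op.
    assert parse_chunk_header_file_range('5,0') == (5, 5)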
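flatten_nested_list exists so the token diff can run on a flat sequence while results still map back to (row, column) positions; compute_changed_token_positions uses the returned index_to_position dict for exactly that. A minimal sketch of its contract:

    from uberblame import flatten_nested_list

    # Two lines of tokens; the reverse index maps each flat index to
    # its (row, column) pair in the nested input.
    flattened, index_to_position = flatten_nested_list([['a', 'b'], ['c']])
    assert flattened == ['a', 'b', 'c']
    assert index_to_position == {0: (0, 0), 1: (0, 1), 2: (1, 0)}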
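Finally, usage as defined by the argparse setup in main(): uberblame.py [revision] file, with the revision defaulting to HEAD. The script targets Python 2 (it uses generator.next() and cgi.escape) and must be run from inside the git checkout it inspects, since uberblame() shells out to git log and git show with a relative file path. A programmatic sketch; the file path here is only a hypothetical example:

    from uberblame import uberblame, visualize_uberblame, show_visualization

    # Token-level blame of a tracked file at HEAD; 'tools/uberblame.py' is
    # just an example path, any file tracked by the repository works.
    data, blame = uberblame('tools/uberblame.py', 'HEAD')
    html = visualize_uberblame(data, blame)
    show_visualization(html)  # writes a temp .html and opens it in a browser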