diff options
author | Jelmer Vernooij <jelmer@jelmer.uk> | 2018-03-30 18:24:19 +0100 |
---|---|---|
committer | Jelmer Vernooij <jelmer@jelmer.uk> | 2018-03-30 18:24:19 +0100 |
commit | fe5b3d2b2d1f4298342c1b1a1d5631dbbf9270ce (patch) | |
tree | dba1340fb42fcf39acf3d9e05685ae2d5dd15ee8 /fastimport | |
parent | b65623b0ddba5a4ec99d5576e4888fcc1200de97 (diff) | |
download | python-fastimport-git-fe5b3d2b2d1f4298342c1b1a1d5631dbbf9270ce.tar.gz |
Add fast-import-{filter,query,info} scripts.
Diffstat (limited to 'fastimport')
-rw-r--r-- | fastimport/helpers.py | 69 | ||||
-rw-r--r-- | fastimport/processors/info_processor.py | 286 | ||||
-rw-r--r-- | fastimport/reftracker.py | 68 | ||||
-rw-r--r-- | fastimport/tests/__init__.py | 1 | ||||
-rw-r--r-- | fastimport/tests/test_info_processor.py | 77 |
5 files changed, 501 insertions, 0 deletions
diff --git a/fastimport/helpers.py b/fastimport/helpers.py index abb7014..67072be 100644 --- a/fastimport/helpers.py +++ b/fastimport/helpers.py @@ -194,3 +194,72 @@ class newobject(object): Hook for the future.utils.native() function """ return object(self) + + +def binary_stream(stream): + """Ensure a stream is binary on Windows. + + :return: the stream + """ + try: + import os + if os.name == 'nt': + fileno = getattr(stream, 'fileno', None) + if fileno: + no = fileno() + if no >= 0: # -1 means we're working as subprocess + import msvcrt + msvcrt.setmode(no, os.O_BINARY) + except ImportError: + pass + return stream + + +def invert_dictset(d): + """Invert a dictionary with keys matching a set of values, turned into lists.""" + # Based on recipe from ASPN + result = {} + for k, c in d.items(): + for v in c: + keys = result.setdefault(v, []) + keys.append(k) + return result + + +def invert_dict(d): + """Invert a dictionary with keys matching each value turned into a list.""" + # Based on recipe from ASPN + result = {} + for k, v in d.items(): + keys = result.setdefault(v, []) + keys.append(k) + return result + + +def defines_to_dict(defines): + """Convert a list of definition strings to a dictionary.""" + if defines is None: + return None + result = {} + for define in defines: + kv = define.split('=', 1) + if len(kv) == 1: + result[define.strip()] = 1 + else: + result[kv[0].strip()] = kv[1].strip() + return result + + +def get_source_stream(source): + if source == '-' or source is None: + import sys + stream = binary_stream(sys.stdin) + elif source.endswith('.gz'): + import gzip + stream = gzip.open(source, "rb") + else: + stream = open(source, "rb") + return stream + + + diff --git a/fastimport/processors/info_processor.py b/fastimport/processors/info_processor.py new file mode 100644 index 0000000..28c7300 --- /dev/null +++ b/fastimport/processors/info_processor.py @@ -0,0 +1,286 @@ +# Copyright (C) 2008 Canonical Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +"""Import processor that dump stats about the input (and doesn't import).""" + +from __future__ import absolute_import + +from .. import ( + reftracker, + ) +from ..helpers import ( + invert_dict, + invert_dictset, + ) +from fastimport import ( + commands, + processor, + ) +import stat + + +class InfoProcessor(processor.ImportProcessor): + """An import processor that dumps statistics about the input. + + No changes to the current repository are made. + + As well as providing useful information about an import + stream before importing it, this processor is useful for + benchmarking the speed at which data can be extracted from + the source. + """ + + def __init__(self, params=None, verbose=0, outf=None): + processor.ImportProcessor.__init__(self, params, verbose, + outf=outf) + + def pre_process(self): + # Init statistics + self.cmd_counts = {} + for cmd in commands.COMMAND_NAMES: + self.cmd_counts[cmd] = 0 + self.file_cmd_counts = {} + for fc in commands.FILE_COMMAND_NAMES: + self.file_cmd_counts[fc] = 0 + self.parent_counts = {} + self.max_parent_count = 0 + self.committers = set() + self.separate_authors_found = False + self.symlinks_found = False + self.executables_found = False + self.sha_blob_references = False + self.lightweight_tags = 0 + # Blob usage tracking + self.blobs = {} + for usage in ['new', 'used', 'unknown', 'unmarked']: + self.blobs[usage] = set() + self.blob_ref_counts = {} + # Head tracking + self.reftracker = reftracker.RefTracker() + # Stuff to cache: a map from mark to # of times that mark is merged + self.merges = {} + # Stuff to cache: these are maps from mark to sets + self.rename_old_paths = {} + self.copy_source_paths = {} + + def post_process(self): + # Dump statistics + cmd_names = commands.COMMAND_NAMES + fc_names = commands.FILE_COMMAND_NAMES + self._dump_stats_group("Command counts", + [(c.decode('utf-8'), self.cmd_counts[c]) for c in cmd_names], str) + self._dump_stats_group("File command counts", + [(c.decode('utf-8'), self.file_cmd_counts[c]) for c in fc_names], str) + + # Commit stats + if self.cmd_counts[b'commit']: + p_items = [] + for i in range(self.max_parent_count + 1): + if i in self.parent_counts: + count = self.parent_counts[i] + p_items.append(("parents-%d" % i, count)) + merges_count = len(self.merges) + p_items.append(('total revisions merged', merges_count)) + flags = { + 'separate authors found': self.separate_authors_found, + 'executables': self.executables_found, + 'symlinks': self.symlinks_found, + 'blobs referenced by SHA': self.sha_blob_references, + } + self._dump_stats_group("Parent counts", p_items, str) + self._dump_stats_group("Commit analysis", sorted(flags.items()), _found) + heads = invert_dictset(self.reftracker.heads) + self._dump_stats_group( + "Head analysis", + [(k.decode('utf-8'), + ', '.join([m.decode('utf-8') for m in v])) + for (k, v) in heads.items()], None, + _iterable_as_config_list) + # note("\t%d\t%s" % (len(self.committers), 'unique committers')) + self._dump_stats_group("Merges", self.merges.items(), None) + # We only show the rename old path and copy source paths when -vv + # (verbose=2) is specified. The output here for mysql's data can't + # be parsed currently so this bit of code needs more work anyhow .. + if self.verbose >= 2: + self._dump_stats_group("Rename old paths", + self.rename_old_paths.items(), len, + _iterable_as_config_list) + self._dump_stats_group("Copy source paths", + self.copy_source_paths.items(), len, + _iterable_as_config_list) + + # Blob stats + if self.cmd_counts[b'blob']: + # In verbose mode, don't list every blob used + if self.verbose: + del self.blobs['used'] + self._dump_stats_group("Blob usage tracking", + self.blobs.items(), len, _iterable_as_config_list) + if self.blob_ref_counts: + blobs_by_count = invert_dict(self.blob_ref_counts) + blob_items = sorted(blobs_by_count.items()) + self._dump_stats_group("Blob reference counts", + blob_items, len, _iterable_as_config_list) + + # Other stats + if self.cmd_counts[b'reset']: + reset_stats = { + 'lightweight tags': self.lightweight_tags, + } + self._dump_stats_group("Reset analysis", reset_stats.items()) + + def _dump_stats_group(self, title, items, normal_formatter=None, + verbose_formatter=None): + """Dump a statistics group. + + In verbose mode, do so as a config file so + that other processors can load the information if they want to. + :param normal_formatter: the callable to apply to the value + before displaying it in normal mode + :param verbose_formatter: the callable to apply to the value + before displaying it in verbose mode + """ + if self.verbose: + self.outf.write("[%s]\n" % (title,)) + for name, value in items: + if verbose_formatter is not None: + value = verbose_formatter(value) + if type(name) == str: + name = name.replace(' ', '-') + self.outf.write("%s = %s\n" % (name, value)) + self.outf.write("\n") + else: + self.outf.write("%s:\n" % (title,)) + for name, value in items: + if normal_formatter is not None: + value = normal_formatter(value) + self.outf.write("\t%s\t%s\n" % (value, name)) + + def progress_handler(self, cmd): + """Process a ProgressCommand.""" + self.cmd_counts[cmd.name] += 1 + + def blob_handler(self, cmd): + """Process a BlobCommand.""" + self.cmd_counts[cmd.name] += 1 + if cmd.mark is None: + self.blobs['unmarked'].add(cmd.id) + else: + self.blobs['new'].add(cmd.id) + # Marks can be re-used so remove it from used if already there. + # Note: we definitely do NOT want to remove it from multi if + # it's already in that set. + try: + self.blobs['used'].remove(cmd.id) + except KeyError: + pass + + def checkpoint_handler(self, cmd): + """Process a CheckpointCommand.""" + self.cmd_counts[cmd.name] += 1 + + def commit_handler(self, cmd): + """Process a CommitCommand.""" + self.cmd_counts[cmd.name] += 1 + self.committers.add(cmd.committer) + if cmd.author is not None: + self.separate_authors_found = True + for fc in cmd.iter_files(): + self.file_cmd_counts[fc.name] += 1 + if isinstance(fc, commands.FileModifyCommand): + if fc.mode & 0o111: + self.executables_found = True + if stat.S_ISLNK(fc.mode): + self.symlinks_found = True + if fc.dataref is not None: + if fc.dataref[0] == ':': + self._track_blob(fc.dataref) + else: + self.sha_blob_references = True + elif isinstance(fc, commands.FileRenameCommand): + self.rename_old_paths.setdefault(cmd.id, set()).add(fc.old_path) + elif isinstance(fc, commands.FileCopyCommand): + self.copy_source_paths.setdefault(cmd.id, set()).add(fc.src_path) + + # Track the heads + parents = self.reftracker.track_heads(cmd) + + # Track the parent counts + parent_count = len(parents) + try: + self.parent_counts[parent_count] += 1 + except KeyError: + self.parent_counts[parent_count] = 1 + if parent_count > self.max_parent_count: + self.max_parent_count = parent_count + + # Remember the merges + if cmd.merges: + #self.merges.setdefault(cmd.ref, set()).update(cmd.merges) + for merge in cmd.merges: + if merge in self.merges: + self.merges[merge] += 1 + else: + self.merges[merge] = 1 + + def reset_handler(self, cmd): + """Process a ResetCommand.""" + self.cmd_counts[cmd.name] += 1 + if cmd.ref.startswith('refs/tags/'): + self.lightweight_tags += 1 + else: + if cmd.from_ is not None: + self.reftracker.track_heads_for_ref( + cmd.ref, cmd.from_) + + def tag_handler(self, cmd): + """Process a TagCommand.""" + self.cmd_counts[cmd.name] += 1 + + def feature_handler(self, cmd): + """Process a FeatureCommand.""" + self.cmd_counts[cmd.name] += 1 + feature = cmd.feature_name + if feature not in commands.FEATURE_NAMES: + self.warning("feature %s is not supported - parsing may fail" + % (feature,)) + + def _track_blob(self, mark): + if mark in self.blob_ref_counts: + self.blob_ref_counts[mark] += 1 + pass + elif mark in self.blobs['used']: + self.blob_ref_counts[mark] = 2 + self.blobs['used'].remove(mark) + elif mark in self.blobs['new']: + self.blobs['used'].add(mark) + self.blobs['new'].remove(mark) + else: + self.blobs['unknown'].add(mark) + +def _found(b): + """Format a found boolean as a string.""" + return ['no', 'found'][b] + +def _iterable_as_config_list(s): + """Format an iterable as a sequence of comma-separated strings. + + To match what ConfigObj expects, a single item list has a trailing comma. + """ + items = sorted(s) + if len(items) == 1: + return "%s," % (items[0],) + else: + return ", ".join(items) diff --git a/fastimport/reftracker.py b/fastimport/reftracker.py new file mode 100644 index 0000000..16a5e45 --- /dev/null +++ b/fastimport/reftracker.py @@ -0,0 +1,68 @@ +# Copyright (C) 2009 Canonical Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + + +"""Tracker of refs.""" + +from __future__ import absolute_import + + +class RefTracker(object): + + def __init__(self): + # Head tracking: last ref, last id per ref & map of commit ids to ref*s* + self.last_ref = None + self.last_ids = {} + self.heads = {} + + def dump_stats(self, note): + self._show_stats_for(self.last_ids, "last-ids", note=note) + self._show_stats_for(self.heads, "heads", note=note) + + def clear(self): + self.last_ids.clear() + self.heads.clear() + + def track_heads(self, cmd): + """Track the repository heads given a CommitCommand. + + :param cmd: the CommitCommand + :return: the list of parents in terms of commit-ids + """ + # Get the true set of parents + if cmd.from_ is not None: + parents = [cmd.from_] + else: + last_id = self.last_ids.get(cmd.ref) + if last_id is not None: + parents = [last_id] + else: + parents = [] + parents.extend(cmd.merges) + + # Track the heads + self.track_heads_for_ref(cmd.ref, cmd.id, parents) + return parents + + def track_heads_for_ref(self, cmd_ref, cmd_id, parents=None): + if parents is not None: + for parent in parents: + if parent in self.heads: + del self.heads[parent] + self.heads.setdefault(cmd_id, set()).add(cmd_ref) + self.last_ids[cmd_ref] = cmd_id + self.last_ref = cmd_ref + + diff --git a/fastimport/tests/__init__.py b/fastimport/tests/__init__.py index ae5acb7..01a681b 100644 --- a/fastimport/tests/__init__.py +++ b/fastimport/tests/__init__.py @@ -26,6 +26,7 @@ def test_suite(): 'test_dates', 'test_errors', 'test_filter_processor', + 'test_info_processor', 'test_helpers', 'test_parser', ] diff --git a/fastimport/tests/test_info_processor.py b/fastimport/tests/test_info_processor.py new file mode 100644 index 0000000..43dd50b --- /dev/null +++ b/fastimport/tests/test_info_processor.py @@ -0,0 +1,77 @@ +# Copyright (C) 2018 Jelmer Vernooij +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +"""Test InfoProcessor""" +from io import BytesIO + +try: + from StringIO import StringIO +except ImportError: + from io import StringIO + +from unittest import TestCase + +from fastimport import ( + parser, + ) + +from fastimport.processors import ( + info_processor, + ) + +simple_fast_import_stream = b"""commit refs/heads/master +mark :1 +committer Jelmer Vernooij <jelmer@samba.org> 1299718135 +0100 +data 7 +initial + +""" + +class TestFastImportInfo(TestCase): + + def test_simple(self): + stream = BytesIO(simple_fast_import_stream) + outf = StringIO() + proc = info_processor.InfoProcessor(outf=outf) + p = parser.ImportParser(stream) + proc.process(p.iter_commands) + + self.maxDiff = None + self.assertMultiLineEqual(outf.getvalue(), """Command counts: +\t0\tblob +\t0\tcheckpoint +\t1\tcommit +\t0\tfeature +\t0\tprogress +\t0\treset +\t0\ttag +File command counts: +\t0\tfilemodify +\t0\tfiledelete +\t0\tfilecopy +\t0\tfilerename +\t0\tfiledeleteall +Parent counts: +\t1\tparents-0 +\t0\ttotal revisions merged +Commit analysis: +\tno\tblobs referenced by SHA +\tno\texecutables +\tno\tseparate authors found +\tno\tsymlinks +Head analysis: +\t:1\trefs/heads/master +Merges: +""") |