summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJelmer Vernooij <jelmer@samba.org>2010-09-06 01:41:23 +0200
committerJelmer Vernooij <jelmer@samba.org>2010-09-06 01:41:23 +0200
commitcd6fd7746de85f146226b4cf98920f2a4a5529c3 (patch)
tree037fb81987dd10817b9a6cf4978bfe11a0f50469
parentdca7e002c69f04c52182aa16ffa1ea230d967055 (diff)
parentc60068bd0035e829a1e11a55d9bd6fe2cde65a32 (diff)
downloadpython-fastimport-cd6fd7746de85f146226b4cf98920f2a4a5529c3.tar.gz
Import processors from bzr-fastimport.
-rw-r--r--fastimport/helpers.py29
-rw-r--r--fastimport/processors/__init__.py0
-rw-r--r--fastimport/processors/filter_processor.py298
-rw-r--r--fastimport/processors/info_processor.py281
-rw-r--r--fastimport/processors/query_processor.py96
-rw-r--r--fastimport/reftracker.py67
-rw-r--r--fastimport/tests/__init__.py4
-rw-r--r--fastimport/tests/test_filter_processor.py879
-rw-r--r--fastimport/tests/test_head_tracking.py260
-rw-r--r--fastimport/tests/test_helpers.py56
-rw-r--r--fastimport/tests/test_parser.py284
11 files changed, 2254 insertions, 0 deletions
diff --git a/fastimport/helpers.py b/fastimport/helpers.py
index 05cce6f..8e9a383 100644
--- a/fastimport/helpers.py
+++ b/fastimport/helpers.py
@@ -92,4 +92,33 @@ def binary_stream(stream):
return stream
+def common_directory(paths):
+ """Find the deepest common directory of a list of paths.
+
+ :return: if no paths are provided, None is returned;
+ if there is no common directory, '' is returned;
+ otherwise the common directory with a trailing / is returned.
+ """
+ from bzrlib import osutils
+ def get_dir_with_slash(path):
+ if path == '' or path.endswith('/'):
+ return path
+ else:
+ dirname, basename = osutils.split(path)
+ if dirname == '':
+ return dirname
+ else:
+ return dirname + '/'
+
+ if not paths:
+ return None
+ elif len(paths) == 1:
+ return get_dir_with_slash(paths[0])
+ else:
+ common = common_path(paths[0], paths[1])
+ for path in paths[2:]:
+ common = common_path(common, path)
+ return get_dir_with_slash(common)
+
+
diff --git a/fastimport/processors/__init__.py b/fastimport/processors/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/fastimport/processors/__init__.py
diff --git a/fastimport/processors/filter_processor.py b/fastimport/processors/filter_processor.py
new file mode 100644
index 0000000..0c8506e
--- /dev/null
+++ b/fastimport/processors/filter_processor.py
@@ -0,0 +1,298 @@
+# Copyright (C) 2009 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+"""Import processor that filters the input (and doesn't import)."""
+
+
+from bzrlib import osutils
+from bzrlib.trace import (
+ warning,
+ )
+from fastimport import (
+ commands,
+ helpers,
+ processor,
+ )
+
+
+class FilterProcessor(processor.ImportProcessor):
+ """An import processor that filters the input to include/exclude objects.
+
+ No changes to the current repository are made.
+
+ Here are the supported parameters:
+
+ * include_paths - a list of paths that commits must change in order to
+ be kept in the output stream
+
+ * exclude_paths - a list of paths that should not appear in the output
+ stream
+ """
+
+ known_params = [
+ 'include_paths',
+ 'exclude_paths',
+ ]
+
+ def pre_process(self):
+ self.includes = self.params.get('include_paths')
+ self.excludes = self.params.get('exclude_paths')
+ # What's the new root, if any
+ self.new_root = helpers.common_directory(self.includes)
+ # Buffer of blobs until we know we need them: mark -> cmd
+ self.blobs = {}
+ # These are the commits we've output so far
+ self.interesting_commits = set()
+ # Map of commit-id to list of parents
+ self.parents = {}
+
+ def pre_handler(self, cmd):
+ self.command = cmd
+ # Should this command be included in the output or not?
+ self.keep = False
+ # Blobs to dump into the output before dumping the command itself
+ self.referenced_blobs = []
+
+ def post_handler(self, cmd):
+ if not self.keep:
+ return
+ # print referenced blobs and the command
+ for blob_id in self.referenced_blobs:
+ self._print_command(self.blobs[blob_id])
+ self._print_command(self.command)
+
+ def progress_handler(self, cmd):
+ """Process a ProgressCommand."""
+ # These always pass through
+ self.keep = True
+
+ def blob_handler(self, cmd):
+ """Process a BlobCommand."""
+ # These never pass through directly. We buffer them and only
+ # output them if referenced by an interesting command.
+ self.blobs[cmd.id] = cmd
+ self.keep = False
+
+ def checkpoint_handler(self, cmd):
+ """Process a CheckpointCommand."""
+ # These always pass through
+ self.keep = True
+
+ def commit_handler(self, cmd):
+ """Process a CommitCommand."""
+ # These pass through if they meet the filtering conditions
+ interesting_filecmds = self._filter_filecommands(cmd.file_iter)
+ if interesting_filecmds:
+ # If all we have is a single deleteall, skip this commit
+ if len(interesting_filecmds) == 1 and isinstance(
+ interesting_filecmds[0], commands.FileDeleteAllCommand):
+ pass
+ else:
+ # Remember just the interesting file commands
+ self.keep = True
+ cmd.file_iter = iter(interesting_filecmds)
+
+ # Record the referenced blobs
+ for fc in interesting_filecmds:
+ if isinstance(fc, commands.FileModifyCommand):
+ if (fc.dataref is not None and
+ fc.kind != 'directory'):
+ self.referenced_blobs.append(fc.dataref)
+
+ # Update from and merges to refer to commits in the output
+ cmd.from_ = self._find_interesting_from(cmd.from_)
+ cmd.merges = self._find_interesting_merges(cmd.merges)
+ self.interesting_commits.add(cmd.id)
+
+ # Keep track of the parents
+ if cmd.from_ and cmd.merges:
+ parents = [cmd.from_] + cmd.merges
+ elif cmd.from_:
+ parents = [cmd.from_]
+ else:
+ parents = None
+ self.parents[":" + cmd.mark] = parents
+
+ def reset_handler(self, cmd):
+ """Process a ResetCommand."""
+ if cmd.from_ is None:
+ # We pass through resets that init a branch because we have to
+ # assume the branch might be interesting.
+ self.keep = True
+ else:
+ # Keep resets if they indirectly reference something we kept
+ cmd.from_ = self._find_interesting_from(cmd.from_)
+ self.keep = cmd.from_ is not None
+
+ def tag_handler(self, cmd):
+ """Process a TagCommand."""
+ # Keep tags if they indirectly reference something we kept
+ cmd.from_ = self._find_interesting_from(cmd.from_)
+ self.keep = cmd.from_ is not None
+
+ def feature_handler(self, cmd):
+ """Process a FeatureCommand."""
+ feature = cmd.feature_name
+ if feature not in commands.FEATURE_NAMES:
+ self.warning("feature %s is not supported - parsing may fail"
+ % (feature,))
+ # These always pass through
+ self.keep = True
+
+ def _print_command(self, cmd):
+ """Wrapper to avoid adding unnecessary blank lines."""
+ text = repr(cmd)
+ self.outf.write(text)
+ if not text.endswith("\n"):
+ self.outf.write("\n")
+
+ def _filter_filecommands(self, filecmd_iter):
+ """Return the filecommands filtered by includes & excludes.
+
+ :return: a list of FileCommand objects
+ """
+ if self.includes is None and self.excludes is None:
+ return list(filecmd_iter())
+
+ # Do the filtering, adjusting for the new_root
+ result = []
+ for fc in filecmd_iter():
+ if (isinstance(fc, commands.FileModifyCommand) or
+ isinstance(fc, commands.FileDeleteCommand)):
+ if self._path_to_be_kept(fc.path):
+ fc.path = self._adjust_for_new_root(fc.path)
+ else:
+ continue
+ elif isinstance(fc, commands.FileDeleteAllCommand):
+ pass
+ elif isinstance(fc, commands.FileRenameCommand):
+ fc = self._convert_rename(fc)
+ elif isinstance(fc, commands.FileCopyCommand):
+ fc = self._convert_copy(fc)
+ else:
+ warning("cannot handle FileCommands of class %s - ignoring",
+ fc.__class__)
+ continue
+ if fc is not None:
+ result.append(fc)
+ return result
+
+ def _path_to_be_kept(self, path):
+ """Does the given path pass the filtering criteria?"""
+ if self.excludes and (path in self.excludes
+ or osutils.is_inside_any(self.excludes, path)):
+ return False
+ if self.includes:
+ return (path in self.includes
+ or osutils.is_inside_any(self.includes, path))
+ return True
+
+ def _adjust_for_new_root(self, path):
+ """Adjust a path given the new root directory of the output."""
+ if self.new_root is None:
+ return path
+ elif path.startswith(self.new_root):
+ return path[len(self.new_root):]
+ else:
+ return path
+
+ def _find_interesting_parent(self, commit_ref):
+ while True:
+ if commit_ref in self.interesting_commits:
+ return commit_ref
+ parents = self.parents.get(commit_ref)
+ if not parents:
+ return None
+ commit_ref = parents[0]
+
+ def _find_interesting_from(self, commit_ref):
+ if commit_ref is None:
+ return None
+ return self._find_interesting_parent(commit_ref)
+
+ def _find_interesting_merges(self, commit_refs):
+ if commit_refs is None:
+ return None
+ merges = []
+ for commit_ref in commit_refs:
+ parent = self._find_interesting_parent(commit_ref)
+ if parent is not None:
+ merges.append(parent)
+ if merges:
+ return merges
+ else:
+ return None
+
+ def _convert_rename(self, fc):
+ """Convert a FileRenameCommand into a new FileCommand.
+
+ :return: None if the rename is being ignored, otherwise a
+        new FileCommand based on whether the old and new paths
+ are inside or outside of the interesting locations.
+ """
+ old = fc.old_path
+ new = fc.new_path
+ keep_old = self._path_to_be_kept(old)
+ keep_new = self._path_to_be_kept(new)
+ if keep_old and keep_new:
+ fc.old_path = self._adjust_for_new_root(old)
+ fc.new_path = self._adjust_for_new_root(new)
+ return fc
+ elif keep_old:
+ # The file has been renamed to a non-interesting location.
+ # Delete it!
+ old = self._adjust_for_new_root(old)
+ return commands.FileDeleteCommand(old)
+ elif keep_new:
+ # The file has been renamed into an interesting location
+ # We really ought to add it but we don't currently buffer
+ # the contents of all previous files and probably never want
+ # to. Maybe fast-import-info needs to be extended to
+ # remember all renames and a config file can be passed
+ # into here ala fast-import?
+ warning("cannot turn rename of %s into an add of %s yet" %
+ (old, new))
+ return None
+
+ def _convert_copy(self, fc):
+ """Convert a FileCopyCommand into a new FileCommand.
+
+ :return: None if the copy is being ignored, otherwise a
+        new FileCommand based on whether the source and destination
+ paths are inside or outside of the interesting locations.
+ """
+ src = fc.src_path
+ dest = fc.dest_path
+ keep_src = self._path_to_be_kept(src)
+ keep_dest = self._path_to_be_kept(dest)
+ if keep_src and keep_dest:
+ fc.src_path = self._adjust_for_new_root(src)
+ fc.dest_path = self._adjust_for_new_root(dest)
+ return fc
+ elif keep_src:
+ # The file has been copied to a non-interesting location.
+ # Ignore it!
+ return None
+ elif keep_dest:
+ # The file has been copied into an interesting location
+ # We really ought to add it but we don't currently buffer
+ # the contents of all previous files and probably never want
+ # to. Maybe fast-import-info needs to be extended to
+ # remember all copies and a config file can be passed
+ # into here ala fast-import?
+ warning("cannot turn copy of %s into an add of %s yet" %
+ (src, dest))
+ return None
diff --git a/fastimport/processors/info_processor.py b/fastimport/processors/info_processor.py
new file mode 100644
index 0000000..6d8c15d
--- /dev/null
+++ b/fastimport/processors/info_processor.py
@@ -0,0 +1,281 @@
+# Copyright (C) 2008 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+"""Import processor that dumps stats about the input (and doesn't import)."""
+
+
+from fastimport import (
+ commands,
+ processor,
+ reftracker,
+ )
+from fastimport.helpers import (
+ invert_dict,
+ invert_dictset,
+ )
+
+
+class InfoProcessor(processor.ImportProcessor):
+ """An import processor that dumps statistics about the input.
+
+ No changes to the current repository are made.
+
+ As well as providing useful information about an import
+ stream before importing it, this processor is useful for
+ benchmarking the speed at which data can be extracted from
+ the source.
+ """
+
+ def __init__(self, params=None, verbose=0, outf=None):
+ processor.ImportProcessor.__init__(self, params, verbose,
+ outf=outf)
+
+ def pre_process(self):
+ self.note("Collecting statistics ...")
+ # Init statistics
+ self.cmd_counts = {}
+ for cmd in commands.COMMAND_NAMES:
+ self.cmd_counts[cmd] = 0
+ self.file_cmd_counts = {}
+ for fc in commands.FILE_COMMAND_NAMES:
+ self.file_cmd_counts[fc] = 0
+ self.parent_counts = {}
+ self.max_parent_count = 0
+ self.committers = set()
+ self.separate_authors_found = False
+ self.symlinks_found = False
+ self.executables_found = False
+ self.sha_blob_references = False
+ self.lightweight_tags = 0
+ # Blob usage tracking
+ self.blobs = {}
+ for usage in ['new', 'used', 'unknown', 'unmarked']:
+ self.blobs[usage] = set()
+ self.blob_ref_counts = {}
+ # Head tracking
+ self.reftracker = reftracker.RefTracker()
+ # Stuff to cache: a map from mark to # of times that mark is merged
+ self.merges = {}
+ # Stuff to cache: these are maps from mark to sets
+ self.rename_old_paths = {}
+ self.copy_source_paths = {}
+
+ def post_process(self):
+ # Dump statistics
+ cmd_names = commands.COMMAND_NAMES
+ fc_names = commands.FILE_COMMAND_NAMES
+ self._dump_stats_group("Command counts",
+ [(c, self.cmd_counts[c]) for c in cmd_names], str)
+ self._dump_stats_group("File command counts",
+ [(c, self.file_cmd_counts[c]) for c in fc_names], str)
+
+ # Commit stats
+ if self.cmd_counts['commit']:
+ p_items = []
+ for i in xrange(0, self.max_parent_count + 1):
+ if i in self.parent_counts:
+ count = self.parent_counts[i]
+ p_items.append(("parents-%d" % i, count))
+ merges_count = len(self.merges.keys())
+ p_items.append(('total revisions merged', merges_count))
+ flags = {
+ 'separate authors found': self.separate_authors_found,
+ 'executables': self.executables_found,
+ 'symlinks': self.symlinks_found,
+ 'blobs referenced by SHA': self.sha_blob_references,
+ }
+ self._dump_stats_group("Parent counts", p_items, str)
+ self._dump_stats_group("Commit analysis", flags.iteritems(), _found)
+ heads = invert_dictset(self.reftracker.heads)
+ self._dump_stats_group("Head analysis", heads.iteritems(), None,
+ _iterable_as_config_list)
+ # note("\t%d\t%s" % (len(self.committers), 'unique committers'))
+ self._dump_stats_group("Merges", self.merges.iteritems(), None)
+ # We only show the rename old path and copy source paths when -vv
+ # (verbose=2) is specified. The output here for mysql's data can't
+ # be parsed currently so this bit of code needs more work anyhow ..
+ if self.verbose >= 2:
+ self._dump_stats_group("Rename old paths",
+ self.rename_old_paths.iteritems(), len,
+ _iterable_as_config_list)
+ self._dump_stats_group("Copy source paths",
+ self.copy_source_paths.iteritems(), len,
+ _iterable_as_config_list)
+
+ # Blob stats
+ if self.cmd_counts['blob']:
+ # In verbose mode, don't list every blob used
+ if self.verbose:
+ del self.blobs['used']
+ self._dump_stats_group("Blob usage tracking",
+ self.blobs.iteritems(), len, _iterable_as_config_list)
+ if self.blob_ref_counts:
+ blobs_by_count = invert_dict(self.blob_ref_counts)
+ blob_items = blobs_by_count.items()
+ blob_items.sort()
+ self._dump_stats_group("Blob reference counts",
+ blob_items, len, _iterable_as_config_list)
+
+ # Other stats
+ if self.cmd_counts['reset']:
+ reset_stats = {
+ 'lightweight tags': self.lightweight_tags,
+ }
+ self._dump_stats_group("Reset analysis", reset_stats.iteritems())
+
+ def _dump_stats_group(self, title, items, normal_formatter=None,
+ verbose_formatter=None):
+ """Dump a statistics group.
+
+ In verbose mode, do so as a config file so
+ that other processors can load the information if they want to.
+ :param normal_formatter: the callable to apply to the value
+ before displaying it in normal mode
+ :param verbose_formatter: the callable to apply to the value
+ before displaying it in verbose mode
+ """
+ if self.verbose:
+ self.outf.write("[%s]\n" % (title,))
+ for name, value in items:
+ if verbose_formatter is not None:
+ value = verbose_formatter(value)
+ if type(name) == str:
+ name = name.replace(' ', '-')
+ self.outf.write("%s = %s\n" % (name, value))
+ self.outf.write("\n")
+ else:
+ self.outf.write("%s:\n" % (title,))
+ for name, value in items:
+ if normal_formatter is not None:
+ value = normal_formatter(value)
+ self.outf.write("\t%s\t%s\n" % (value, name))
+
+ def progress_handler(self, cmd):
+ """Process a ProgressCommand."""
+ self.cmd_counts[cmd.name] += 1
+
+ def blob_handler(self, cmd):
+ """Process a BlobCommand."""
+ self.cmd_counts[cmd.name] += 1
+ if cmd.mark is None:
+ self.blobs['unmarked'].add(cmd.id)
+ else:
+ self.blobs['new'].add(cmd.id)
+ # Marks can be re-used so remove it from used if already there.
+ # Note: we definitely do NOT want to remove it from multi if
+ # it's already in that set.
+ try:
+ self.blobs['used'].remove(cmd.id)
+ except KeyError:
+ pass
+
+ def checkpoint_handler(self, cmd):
+ """Process a CheckpointCommand."""
+ self.cmd_counts[cmd.name] += 1
+
+ def commit_handler(self, cmd):
+ """Process a CommitCommand."""
+ self.cmd_counts[cmd.name] += 1
+ self.committers.add(cmd.committer)
+ if cmd.author is not None:
+ self.separate_authors_found = True
+ for fc in cmd.file_iter():
+ self.file_cmd_counts[fc.name] += 1
+ if isinstance(fc, commands.FileModifyCommand):
+ if fc.is_executable:
+ self.executables_found = True
+ if fc.kind == commands.SYMLINK_KIND:
+ self.symlinks_found = True
+ if fc.dataref is not None:
+ if fc.dataref[0] == ':':
+ self._track_blob(fc.dataref)
+ else:
+ self.sha_blob_references = True
+ elif isinstance(fc, commands.FileRenameCommand):
+ self.rename_old_paths.setdefault(cmd.id, set()).add(fc.old_path)
+ elif isinstance(fc, commands.FileCopyCommand):
+ self.copy_source_paths.setdefault(cmd.id, set()).add(fc.src_path)
+
+ # Track the heads
+ parents = self.reftracker.track_heads(cmd)
+
+ # Track the parent counts
+ parent_count = len(parents)
+ if self.parent_counts.has_key(parent_count):
+ self.parent_counts[parent_count] += 1
+ else:
+ self.parent_counts[parent_count] = 1
+ if parent_count > self.max_parent_count:
+ self.max_parent_count = parent_count
+
+ # Remember the merges
+ if cmd.merges:
+ #self.merges.setdefault(cmd.ref, set()).update(cmd.merges)
+ for merge in cmd.merges:
+ if merge in self.merges:
+ self.merges[merge] += 1
+ else:
+ self.merges[merge] = 1
+
+ def reset_handler(self, cmd):
+ """Process a ResetCommand."""
+ self.cmd_counts[cmd.name] += 1
+ if cmd.ref.startswith('refs/tags/'):
+ self.lightweight_tags += 1
+ else:
+ if cmd.from_ is not None:
+ self.reftracker.track_heads_for_ref(
+ cmd.ref, cmd.from_)
+
+ def tag_handler(self, cmd):
+ """Process a TagCommand."""
+ self.cmd_counts[cmd.name] += 1
+
+ def feature_handler(self, cmd):
+ """Process a FeatureCommand."""
+ self.cmd_counts[cmd.name] += 1
+ feature = cmd.feature_name
+ if feature not in commands.FEATURE_NAMES:
+ self.warning("feature %s is not supported - parsing may fail"
+ % (feature,))
+
+ def _track_blob(self, mark):
+ if mark in self.blob_ref_counts:
+ self.blob_ref_counts[mark] += 1
+ pass
+ elif mark in self.blobs['used']:
+ self.blob_ref_counts[mark] = 2
+ self.blobs['used'].remove(mark)
+ elif mark in self.blobs['new']:
+ self.blobs['used'].add(mark)
+ self.blobs['new'].remove(mark)
+ else:
+ self.blobs['unknown'].add(mark)
+
+def _found(b):
+ """Format a found boolean as a string."""
+ return ['no', 'found'][b]
+
+def _iterable_as_config_list(s):
+ """Format an iterable as a sequence of comma-separated strings.
+
+ To match what ConfigObj expects, a single item list has a trailing comma.
+ """
+ items = sorted(s)
+ if len(items) == 1:
+ return "%s," % (items[0],)
+ else:
+ return ", ".join(items)
diff --git a/fastimport/processors/query_processor.py b/fastimport/processors/query_processor.py
new file mode 100644
index 0000000..76250e5
--- /dev/null
+++ b/fastimport/processors/query_processor.py
@@ -0,0 +1,96 @@
+# Copyright (C) 2008 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+"""Import processor that queries the input (and doesn't import)."""
+
+
+from fastimport import (
+ commands,
+ processor,
+ )
+
+
+class QueryProcessor(processor.ImportProcessor):
+ """An import processor that queries the input.
+
+ No changes to the current repository are made.
+ """
+
+ known_params = commands.COMMAND_NAMES + commands.FILE_COMMAND_NAMES + \
+ ['commit-mark']
+
+ def __init__(self, params=None, verbose=False):
+ processor.ImportProcessor.__init__(self, params, verbose)
+ self.parsed_params = {}
+ self.interesting_commit = None
+ self._finished = False
+ if params:
+ if 'commit-mark' in params:
+ self.interesting_commit = params['commit-mark']
+ del params['commit-mark']
+ for name, value in params.iteritems():
+ if value == 1:
+ # All fields
+ fields = None
+ else:
+ fields = value.split(',')
+ self.parsed_params[name] = fields
+
+ def pre_handler(self, cmd):
+ """Hook for logic before each handler starts."""
+ if self._finished:
+ return
+ if self.interesting_commit and cmd.name == 'commit':
+ if cmd.mark == self.interesting_commit:
+ print cmd.to_string()
+ self._finished = True
+ return
+ if self.parsed_params.has_key(cmd.name):
+ fields = self.parsed_params[cmd.name]
+ str = cmd.dump_str(fields, self.parsed_params, self.verbose)
+ print "%s" % (str,)
+
+ def progress_handler(self, cmd):
+ """Process a ProgressCommand."""
+ pass
+
+ def blob_handler(self, cmd):
+ """Process a BlobCommand."""
+ pass
+
+ def checkpoint_handler(self, cmd):
+ """Process a CheckpointCommand."""
+ pass
+
+ def commit_handler(self, cmd):
+ """Process a CommitCommand."""
+ for fc in cmd.file_iter():
+ pass
+
+ def reset_handler(self, cmd):
+ """Process a ResetCommand."""
+ pass
+
+ def tag_handler(self, cmd):
+ """Process a TagCommand."""
+ pass
+
+ def feature_handler(self, cmd):
+ """Process a FeatureCommand."""
+ feature = cmd.feature_name
+ if feature not in commands.FEATURE_NAMES:
+ self.warning("feature %s is not supported - parsing may fail"
+ % (feature,))
diff --git a/fastimport/reftracker.py b/fastimport/reftracker.py
new file mode 100644
index 0000000..3862180
--- /dev/null
+++ b/fastimport/reftracker.py
@@ -0,0 +1,67 @@
+# Copyright (C) 2009 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+"""Tracker of refs."""
+
+
+class RefTracker(object):
+
+ def __init__(self):
+ # Head tracking: last ref, last id per ref & map of commit ids to ref*s*
+ self.last_ref = None
+ self.last_ids = {}
+ self.heads = {}
+
+ def dump_stats(self, note):
+ self._show_stats_for(self.last_ids, "last-ids", note=note)
+ self._show_stats_for(self.heads, "heads", note=note)
+
+ def clear(self):
+ self.last_ids.clear()
+ self.heads.clear()
+
+ def track_heads(self, cmd):
+ """Track the repository heads given a CommitCommand.
+
+ :param cmd: the CommitCommand
+ :return: the list of parents in terms of commit-ids
+ """
+ # Get the true set of parents
+ if cmd.from_ is not None:
+ parents = [cmd.from_]
+ else:
+ last_id = self.last_ids.get(cmd.ref)
+ if last_id is not None:
+ parents = [last_id]
+ else:
+ parents = []
+ parents.extend(cmd.merges)
+
+ # Track the heads
+ self.track_heads_for_ref(cmd.ref, cmd.id, parents)
+ return parents
+
+ def track_heads_for_ref(self, cmd_ref, cmd_id, parents=None):
+ if parents is not None:
+ for parent in parents:
+ if parent in self.heads:
+ del self.heads[parent]
+ self.heads.setdefault(cmd_id, set()).add(cmd_ref)
+ self.last_ids[cmd_ref] = cmd_id
+ self.last_ref = cmd_ref
+
+
diff --git a/fastimport/tests/__init__.py b/fastimport/tests/__init__.py
index 2d80157..3a8e69f 100644
--- a/fastimport/tests/__init__.py
+++ b/fastimport/tests/__init__.py
@@ -26,6 +26,10 @@ def test_suite():
names = [
'test_commands',
'test_errors',
+ 'test_filter_processor',
+ 'test_helpers',
+ 'test_head_tracking',
+ 'test_parser',
]
module_names = ['fastimport.tests.' + name for name in names]
result = unittest.TestSuite()
diff --git a/fastimport/tests/test_filter_processor.py b/fastimport/tests/test_filter_processor.py
new file mode 100644
index 0000000..af107d3
--- /dev/null
+++ b/fastimport/tests/test_filter_processor.py
@@ -0,0 +1,879 @@
+# Copyright (C) 2009 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+"""Test FilterProcessor"""
+
+from cStringIO import StringIO
+
+from testtools import TestCase
+
+from fastimport import (
+ parser,
+ )
+
+from fastimport.processors import (
+ filter_processor,
+ )
+
+
+# A sample input stream containing all (top level) import commands
+_SAMPLE_ALL = \
+"""blob
+mark :1
+data 4
+foo
+commit refs/heads/master
+mark :2
+committer Joe <joe@example.com> 1234567890 +1000
+data 14
+Initial import
+M 644 :1 COPYING
+checkpoint
+progress first import done
+reset refs/remote/origin/master
+from :2
+tag v0.1
+from :2
+tagger Joe <joe@example.com> 1234567890 +1000
+data 12
+release v0.1
+"""
+
+
+# A sample input stream creating the following tree:
+#
+# NEWS
+# doc/README.txt
+# doc/index.txt
+_SAMPLE_WITH_DIR = \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a <b@c> 1234798653 +0000
+data 4
+test
+M 644 :1 doc/README.txt
+blob
+mark :2
+data 17
+Life
+is
+good ...
+commit refs/heads/master
+mark :101
+committer a <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :2 NEWS
+blob
+mark :3
+data 19
+Welcome!
+my friend
+blob
+mark :4
+data 11
+== Docs ==
+commit refs/heads/master
+mark :102
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :101
+M 644 :3 doc/README.txt
+M 644 :4 doc/index.txt
+"""
+
+
+class TestCaseWithFiltering(TestCase):
+
+ def assertFiltering(self, input, params, expected):
+ outf = StringIO()
+ proc = filter_processor.FilterProcessor(
+ params=params)
+ proc.outf = outf
+ s = StringIO(input)
+ p = parser.ImportParser(s)
+ proc.process(p.iter_commands)
+ out = outf.getvalue()
+ self.assertEquals(expected, out)
+
+
+class TestNoFiltering(TestCaseWithFiltering):
+
+ def test_params_not_given(self):
+ self.assertFiltering(_SAMPLE_ALL, None, _SAMPLE_ALL)
+
+ def test_params_are_none(self):
+ params = {'include_paths': None, 'exclude_paths': None}
+ self.assertFiltering(_SAMPLE_ALL, params, _SAMPLE_ALL)
+
+
+class TestIncludePaths(TestCaseWithFiltering):
+
+ def test_file_in_root(self):
+ # Things to note:
+ # * only referenced blobs are retained
+ # * from clause is dropped from the first command
+ params = {'include_paths': ['NEWS']}
+ self.assertFiltering(_SAMPLE_WITH_DIR, params, \
+"""blob
+mark :2
+data 17
+Life
+is
+good ...
+commit refs/heads/master
+mark :101
+committer a <b@c> 1234798653 +0000
+data 8
+test
+ing
+M 644 :2 NEWS
+""")
+
+ def test_file_in_subdir(self):
+ # Additional things to note:
+ # * new root: path is now index.txt, not doc/index.txt
+ # * other files changed in matching commits are excluded
+ params = {'include_paths': ['doc/index.txt']}
+ self.assertFiltering(_SAMPLE_WITH_DIR, params, \
+"""blob
+mark :4
+data 11
+== Docs ==
+commit refs/heads/master
+mark :102
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+M 644 :4 index.txt
+""")
+
+ def test_file_with_changes(self):
+ # Additional things to note:
+ # * from updated to reference parents in the output
+ params = {'include_paths': ['doc/README.txt']}
+ self.assertFiltering(_SAMPLE_WITH_DIR, params, \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a <b@c> 1234798653 +0000
+data 4
+test
+M 644 :1 README.txt
+blob
+mark :3
+data 19
+Welcome!
+my friend
+commit refs/heads/master
+mark :102
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :3 README.txt
+""")
+
+ def test_subdir(self):
+ params = {'include_paths': ['doc/']}
+ self.assertFiltering(_SAMPLE_WITH_DIR, params, \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a <b@c> 1234798653 +0000
+data 4
+test
+M 644 :1 README.txt
+blob
+mark :3
+data 19
+Welcome!
+my friend
+blob
+mark :4
+data 11
+== Docs ==
+commit refs/heads/master
+mark :102
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :3 README.txt
+M 644 :4 index.txt
+""")
+
+ def test_multiple_files_in_subdir(self):
+        # The new root should be the subdirectory
+ params = {'include_paths': ['doc/README.txt', 'doc/index.txt']}
+ self.assertFiltering(_SAMPLE_WITH_DIR, params, \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a <b@c> 1234798653 +0000
+data 4
+test
+M 644 :1 README.txt
+blob
+mark :3
+data 19
+Welcome!
+my friend
+blob
+mark :4
+data 11
+== Docs ==
+commit refs/heads/master
+mark :102
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :3 README.txt
+M 644 :4 index.txt
+""")
+
+
+class TestExcludePaths(TestCaseWithFiltering):
+
+ def test_file_in_root(self):
+ params = {'exclude_paths': ['NEWS']}
+ self.assertFiltering(_SAMPLE_WITH_DIR, params, \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a <b@c> 1234798653 +0000
+data 4
+test
+M 644 :1 doc/README.txt
+blob
+mark :3
+data 19
+Welcome!
+my friend
+blob
+mark :4
+data 11
+== Docs ==
+commit refs/heads/master
+mark :102
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :3 doc/README.txt
+M 644 :4 doc/index.txt
+""")
+
+ def test_file_in_subdir(self):
+ params = {'exclude_paths': ['doc/README.txt']}
+ self.assertFiltering(_SAMPLE_WITH_DIR, params, \
+"""blob
+mark :2
+data 17
+Life
+is
+good ...
+commit refs/heads/master
+mark :101
+committer a <b@c> 1234798653 +0000
+data 8
+test
+ing
+M 644 :2 NEWS
+blob
+mark :4
+data 11
+== Docs ==
+commit refs/heads/master
+mark :102
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :101
+M 644 :4 doc/index.txt
+""")
+
+ def test_subdir(self):
+ params = {'exclude_paths': ['doc/']}
+ self.assertFiltering(_SAMPLE_WITH_DIR, params, \
+"""blob
+mark :2
+data 17
+Life
+is
+good ...
+commit refs/heads/master
+mark :101
+committer a <b@c> 1234798653 +0000
+data 8
+test
+ing
+M 644 :2 NEWS
+""")
+
+ def test_multiple_files(self):
+ # Excluding two separate files drops them (and their blobs) while
+ # the surviving doc/README.txt history is kept with paths intact.
+ params = {'exclude_paths': ['doc/index.txt', 'NEWS']}
+ self.assertFiltering(_SAMPLE_WITH_DIR, params, \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a <b@c> 1234798653 +0000
+data 4
+test
+M 644 :1 doc/README.txt
+blob
+mark :3
+data 19
+Welcome!
+my friend
+commit refs/heads/master
+mark :102
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :3 doc/README.txt
+""")
+
+
+class TestIncludeAndExcludePaths(TestCaseWithFiltering):
+
+ def test_included_dir_and_excluded_file(self):
+ params = {'include_paths': ['doc/'], 'exclude_paths': ['doc/index.txt']}
+ self.assertFiltering(_SAMPLE_WITH_DIR, params, \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a <b@c> 1234798653 +0000
+data 4
+test
+M 644 :1 README.txt
+blob
+mark :3
+data 19
+Welcome!
+my friend
+commit refs/heads/master
+mark :102
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :3 README.txt
+""")
+
+
+# A sample input stream creating the following tree:
+#
+# NEWS
+# doc/README.txt
+# doc/index.txt
+#
+# It then renames doc/README.txt => doc/README
+_SAMPLE_WITH_RENAME_INSIDE = _SAMPLE_WITH_DIR + \
+"""commit refs/heads/master
+mark :103
+committer d <b@c> 1234798653 +0000
+data 10
+move intro
+from :102
+R doc/README.txt doc/README
+"""
+
+# A sample input stream creating the following tree:
+#
+# NEWS
+# doc/README.txt
+# doc/index.txt
+#
+# It then renames doc/README.txt => README
+_SAMPLE_WITH_RENAME_TO_OUTSIDE = _SAMPLE_WITH_DIR + \
+"""commit refs/heads/master
+mark :103
+committer d <b@c> 1234798653 +0000
+data 10
+move intro
+from :102
+R doc/README.txt README
+"""
+
+# A sample input stream creating the following tree:
+#
+# NEWS
+# doc/README.txt
+# doc/index.txt
+#
+# It then renames NEWS => doc/NEWS
+_SAMPLE_WITH_RENAME_TO_INSIDE = _SAMPLE_WITH_DIR + \
+"""commit refs/heads/master
+mark :103
+committer d <b@c> 1234798653 +0000
+data 10
+move intro
+from :102
+R NEWS doc/NEWS
+"""
+
+class TestIncludePathsWithRenames(TestCaseWithFiltering):
+
+ def test_rename_all_inside(self):
+ # These rename commands ought to be kept but adjusted for the new root
+ params = {'include_paths': ['doc/']}
+ self.assertFiltering(_SAMPLE_WITH_RENAME_INSIDE, params, \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a <b@c> 1234798653 +0000
+data 4
+test
+M 644 :1 README.txt
+blob
+mark :3
+data 19
+Welcome!
+my friend
+blob
+mark :4
+data 11
+== Docs ==
+commit refs/heads/master
+mark :102
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :3 README.txt
+M 644 :4 index.txt
+commit refs/heads/master
+mark :103
+committer d <b@c> 1234798653 +0000
+data 10
+move intro
+from :102
+R README.txt README
+""")
+
+ def test_rename_to_outside(self):
+ # These rename commands become deletes
+ params = {'include_paths': ['doc/']}
+ self.assertFiltering(_SAMPLE_WITH_RENAME_TO_OUTSIDE, params, \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a <b@c> 1234798653 +0000
+data 4
+test
+M 644 :1 README.txt
+blob
+mark :3
+data 19
+Welcome!
+my friend
+blob
+mark :4
+data 11
+== Docs ==
+commit refs/heads/master
+mark :102
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :3 README.txt
+M 644 :4 index.txt
+commit refs/heads/master
+mark :103
+committer d <b@c> 1234798653 +0000
+data 10
+move intro
+from :102
+D README.txt
+""")
+
+ def test_rename_to_inside(self):
+ # This ought to create a new file but doesn't yet
+ params = {'include_paths': ['doc/']}
+ self.assertFiltering(_SAMPLE_WITH_RENAME_TO_INSIDE, params, \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a <b@c> 1234798653 +0000
+data 4
+test
+M 644 :1 README.txt
+blob
+mark :3
+data 19
+Welcome!
+my friend
+blob
+mark :4
+data 11
+== Docs ==
+commit refs/heads/master
+mark :102
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :3 README.txt
+M 644 :4 index.txt
+""")
+
+
+# A sample input stream creating the following tree:
+#
+# NEWS
+# doc/README.txt
+# doc/index.txt
+#
+# It then copies doc/README.txt => doc/README
+_SAMPLE_WITH_COPY_INSIDE = _SAMPLE_WITH_DIR + \
+"""commit refs/heads/master
+mark :103
+committer d <b@c> 1234798653 +0000
+data 10
+move intro
+from :102
+C doc/README.txt doc/README
+"""
+
+# A sample input stream creating the following tree:
+#
+# NEWS
+# doc/README.txt
+# doc/index.txt
+#
+# It then copies doc/README.txt => README
+_SAMPLE_WITH_COPY_TO_OUTSIDE = _SAMPLE_WITH_DIR + \
+"""commit refs/heads/master
+mark :103
+committer d <b@c> 1234798653 +0000
+data 10
+move intro
+from :102
+C doc/README.txt README
+"""
+
+# A sample input stream creating the following tree:
+#
+# NEWS
+# doc/README.txt
+# doc/index.txt
+#
+# It then copies NEWS => doc/NEWS
+_SAMPLE_WITH_COPY_TO_INSIDE = _SAMPLE_WITH_DIR + \
+"""commit refs/heads/master
+mark :103
+committer d <b@c> 1234798653 +0000
+data 10
+move intro
+from :102
+C NEWS doc/NEWS
+"""
+
+
+class TestIncludePathsWithCopies(TestCaseWithFiltering):
+
+ def test_copy_all_inside(self):
+ # These copy commands ought to be kept but adjusted for the new root
+ params = {'include_paths': ['doc/']}
+ self.assertFiltering(_SAMPLE_WITH_COPY_INSIDE, params, \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a <b@c> 1234798653 +0000
+data 4
+test
+M 644 :1 README.txt
+blob
+mark :3
+data 19
+Welcome!
+my friend
+blob
+mark :4
+data 11
+== Docs ==
+commit refs/heads/master
+mark :102
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :3 README.txt
+M 644 :4 index.txt
+commit refs/heads/master
+mark :103
+committer d <b@c> 1234798653 +0000
+data 10
+move intro
+from :102
+C README.txt README
+""")
+
+ def test_copy_to_outside(self):
+ # This can be ignored
+ params = {'include_paths': ['doc/']}
+ self.assertFiltering(_SAMPLE_WITH_COPY_TO_OUTSIDE, params, \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a <b@c> 1234798653 +0000
+data 4
+test
+M 644 :1 README.txt
+blob
+mark :3
+data 19
+Welcome!
+my friend
+blob
+mark :4
+data 11
+== Docs ==
+commit refs/heads/master
+mark :102
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :3 README.txt
+M 644 :4 index.txt
+""")
+
+ def test_copy_to_inside(self):
+ # This ought to create a new file but doesn't yet
+ params = {'include_paths': ['doc/']}
+ self.assertFiltering(_SAMPLE_WITH_COPY_TO_INSIDE, params, \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a <b@c> 1234798653 +0000
+data 4
+test
+M 644 :1 README.txt
+blob
+mark :3
+data 19
+Welcome!
+my friend
+blob
+mark :4
+data 11
+== Docs ==
+commit refs/heads/master
+mark :102
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :3 README.txt
+M 644 :4 index.txt
+""")
+
+
+# A sample input stream with deleteall's creating the following tree:
+#
+# NEWS
+# doc/README.txt
+# doc/index.txt
+_SAMPLE_WITH_DELETEALL = \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a <b@c> 1234798653 +0000
+data 4
+test
+deleteall
+M 644 :1 doc/README.txt
+blob
+mark :3
+data 19
+Welcome!
+my friend
+blob
+mark :4
+data 11
+== Docs ==
+commit refs/heads/master
+mark :102
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :100
+deleteall
+M 644 :3 doc/README.txt
+M 644 :4 doc/index.txt
+"""
+
+
+class TestIncludePathsWithDeleteAll(TestCaseWithFiltering):
+
+ def test_deleteall(self):
+ # deleteall commands are passed through unchanged; the kept file's
+ # path is rebased to the new root (doc/ stripped) and unrelated
+ # blobs/commits are dropped.
+ params = {'include_paths': ['doc/index.txt']}
+ self.assertFiltering(_SAMPLE_WITH_DELETEALL, params, \
+"""blob
+mark :4
+data 11
+== Docs ==
+commit refs/heads/master
+mark :102
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+deleteall
+M 644 :4 index.txt
+""")
+
+
+_SAMPLE_WITH_TAGS = _SAMPLE_WITH_DIR + \
+"""tag v0.1
+from :100
+tagger d <b@c> 1234798653 +0000
+data 12
+release v0.1
+tag v0.2
+from :102
+tagger d <b@c> 1234798653 +0000
+data 12
+release v0.2
+"""
+
+class TestIncludePathsWithTags(TestCaseWithFiltering):
+
+ def test_tag_retention(self):
+ # If a tag references a commit with a parent we kept,
+ # keep the tag but adjust 'from' accordingly.
+ # Otherwise, delete the tag command.
+ params = {'include_paths': ['NEWS']}
+ self.assertFiltering(_SAMPLE_WITH_TAGS, params, \
+"""blob
+mark :2
+data 17
+Life
+is
+good ...
+commit refs/heads/master
+mark :101
+committer a <b@c> 1234798653 +0000
+data 8
+test
+ing
+M 644 :2 NEWS
+tag v0.2
+from :101
+tagger d <b@c> 1234798653 +0000
+data 12
+release v0.2
+""")
+
+
+_SAMPLE_WITH_RESETS = _SAMPLE_WITH_DIR + \
+"""reset refs/heads/foo
+reset refs/heads/bar
+from :102
+"""
+
+class TestIncludePathsWithResets(TestCaseWithFiltering):
+
+ def test_reset_retention(self):
+ # Resets init'ing a branch (without a from) are passed through.
+ # If a reset references a commit with a parent we kept,
+ # keep the reset but adjust 'from' accordingly.
+ params = {'include_paths': ['NEWS']}
+ self.assertFiltering(_SAMPLE_WITH_RESETS, params, \
+"""blob
+mark :2
+data 17
+Life
+is
+good ...
+commit refs/heads/master
+mark :101
+committer a <b@c> 1234798653 +0000
+data 8
+test
+ing
+M 644 :2 NEWS
+reset refs/heads/foo
+reset refs/heads/bar
+from :101
+""")
diff --git a/fastimport/tests/test_head_tracking.py b/fastimport/tests/test_head_tracking.py
new file mode 100644
index 0000000..7a1ba64
--- /dev/null
+++ b/fastimport/tests/test_head_tracking.py
@@ -0,0 +1,260 @@
+# Copyright (C) 2009 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+"""Test tracking of heads"""
+
+from cStringIO import StringIO
+
+from fastimport import (
+ commands,
+ parser,
+ )
+
+import testtools
+
+from fastimport.reftracker import (
+ RefTracker,
+ )
+
+
+# A sample input stream that only adds files to a branch
+_SAMPLE_MAINLINE = \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a <b@c> 1234798653 +0000
+data 4
+test
+M 644 :1 doc/README.txt
+blob
+mark :2
+data 17
+Life
+is
+good ...
+commit refs/heads/master
+mark :101
+committer a <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :2 NEWS
+blob
+mark :3
+data 19
+Welcome!
+my friend
+blob
+mark :4
+data 11
+== Docs ==
+commit refs/heads/master
+mark :102
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :101
+M 644 :3 doc/README.txt
+M 644 :4 doc/index.txt
+"""
+
+# A sample input stream that adds files to two branches
+_SAMPLE_TWO_HEADS = \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a <b@c> 1234798653 +0000
+data 4
+test
+M 644 :1 doc/README.txt
+blob
+mark :2
+data 17
+Life
+is
+good ...
+commit refs/heads/mybranch
+mark :101
+committer a <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :2 NEWS
+blob
+mark :3
+data 19
+Welcome!
+my friend
+blob
+mark :4
+data 11
+== Docs ==
+commit refs/heads/master
+mark :102
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :3 doc/README.txt
+M 644 :4 doc/index.txt
+"""
+
+# A sample input stream that adds files to two branches
+_SAMPLE_TWO_BRANCHES_MERGED = \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a <b@c> 1234798653 +0000
+data 4
+test
+M 644 :1 doc/README.txt
+blob
+mark :2
+data 17
+Life
+is
+good ...
+commit refs/heads/mybranch
+mark :101
+committer a <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :2 NEWS
+blob
+mark :3
+data 19
+Welcome!
+my friend
+blob
+mark :4
+data 11
+== Docs ==
+commit refs/heads/master
+mark :102
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :3 doc/README.txt
+M 644 :4 doc/index.txt
+commit refs/heads/master
+mark :103
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :102
+merge :101
+D doc/index.txt
+"""
+
+# A sample input stream that contains a reset
+_SAMPLE_RESET = \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a <b@c> 1234798653 +0000
+data 4
+test
+M 644 :1 doc/README.txt
+reset refs/remotes/origin/master
+from :100
+"""
+
+# A sample input stream that contains a reset and more commits
+_SAMPLE_RESET_WITH_MORE_COMMITS = \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a <b@c> 1234798653 +0000
+data 4
+test
+M 644 :1 doc/README.txt
+reset refs/remotes/origin/master
+from :100
+commit refs/remotes/origin/master
+mark :101
+committer d <b@c> 1234798653 +0000
+data 8
+test
+ing
+from :100
+D doc/README.txt
+"""
+
+class TestHeadTracking(testtools.TestCase):
+ """Verify RefTracker's view of branch heads over sample streams."""
+
+ def assertHeads(self, input, expected):
+ # Parse the fast-import stream, feed commits (and resets that carry
+ # a from clause) to a RefTracker, then compare its heads mapping
+ # (commit id -> set of refs pointing at it) against `expected`.
+ s = StringIO(input)
+ p = parser.ImportParser(s)
+ reftracker = RefTracker()
+ for cmd in p.iter_commands():
+ if isinstance(cmd, commands.CommitCommand):
+ reftracker.track_heads(cmd)
+ # eat the file commands
+ list(cmd.file_iter())
+ elif isinstance(cmd, commands.ResetCommand):
+ if cmd.from_ is not None:
+ reftracker.track_heads_for_ref(cmd.ref, cmd.from_)
+ self.assertEqual(reftracker.heads, expected)
+
+ def test_mainline(self):
+ # A linear history leaves a single head at the last commit.
+ self.assertHeads(_SAMPLE_MAINLINE, {
+ ':102': set(['refs/heads/master']),
+ })
+
+ def test_two_heads(self):
+ # Two diverging branches each keep their own head.
+ self.assertHeads(_SAMPLE_TWO_HEADS, {
+ ':101': set(['refs/heads/mybranch']),
+ ':102': set(['refs/heads/master']),
+ })
+
+ def test_two_branches_merged(self):
+ # A merge commit absorbs the merged branch's head.
+ self.assertHeads(_SAMPLE_TWO_BRANCHES_MERGED, {
+ ':103': set(['refs/heads/master']),
+ })
+
+ def test_reset(self):
+ # A reset with a from clause makes both refs point at :100.
+ self.assertHeads(_SAMPLE_RESET, {
+ ':100': set(['refs/heads/master', 'refs/remotes/origin/master']),
+ })
+
+ def test_reset_with_more_commits(self):
+ # A later commit on the reset ref moves that head forward; note
+ # refs/heads/master is no longer reported — presumably superseded
+ # by the child commit (TODO confirm against RefTracker semantics).
+ self.assertHeads(_SAMPLE_RESET_WITH_MORE_COMMITS, {
+ ':101': set(['refs/remotes/origin/master']),
+ })
diff --git a/fastimport/tests/test_helpers.py b/fastimport/tests/test_helpers.py
new file mode 100644
index 0000000..639e436
--- /dev/null
+++ b/fastimport/tests/test_helpers.py
@@ -0,0 +1,56 @@
+# Copyright (C) 2009 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+"""Test the helper functions."""
+
+import testtools
+
+from fastimport import (
+ helpers,
+ )
+
+
+class TestCommonDirectory(testtools.TestCase):
+ """Tests for helpers.common_directory.
+
+ Contract: no paths -> None; no common directory -> ''; otherwise the
+ deepest common directory with a trailing '/'.
+ """
+
+ def test_no_paths(self):
+ c = helpers.common_directory(None)
+ self.assertEqual(c, None)
+ c = helpers.common_directory([])
+ self.assertEqual(c, None)
+
+ def test_one_path(self):
+ # A bare filename has no directory component.
+ c = helpers.common_directory(['foo'])
+ self.assertEqual(c, '')
+ c = helpers.common_directory(['foo/'])
+ self.assertEqual(c, 'foo/')
+ c = helpers.common_directory(['foo/bar'])
+ self.assertEqual(c, 'foo/')
+
+ def test_two_paths(self):
+ c = helpers.common_directory(['foo', 'bar'])
+ self.assertEqual(c, '')
+ c = helpers.common_directory(['foo/', 'bar'])
+ self.assertEqual(c, '')
+ c = helpers.common_directory(['foo/', 'foo/bar'])
+ self.assertEqual(c, 'foo/')
+ c = helpers.common_directory(['foo/bar/x', 'foo/bar/y'])
+ self.assertEqual(c, 'foo/bar/')
+ # The common string prefix 'foo/bar/aa_' must be trimmed back to a
+ # directory boundary, not returned as-is.
+ c = helpers.common_directory(['foo/bar/aa_x', 'foo/bar/aa_y'])
+ self.assertEqual(c, 'foo/bar/')
+
+ def test_lots_of_paths(self):
+ c = helpers.common_directory(['foo/bar/x', 'foo/bar/y', 'foo/bar/z'])
+ self.assertEqual(c, 'foo/bar/')
diff --git a/fastimport/tests/test_parser.py b/fastimport/tests/test_parser.py
new file mode 100644
index 0000000..267ec13
--- /dev/null
+++ b/fastimport/tests/test_parser.py
@@ -0,0 +1,284 @@
+# Copyright (C) 2008 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+"""Test the Import parsing"""
+
+import StringIO
+
+import testtools
+
+from fastimport import (
+ errors,
+ parser,
+ )
+
+
+class TestLineBasedParser(testtools.TestCase):
+ """Tests for the low-level line buffering / pushback parser."""
+
+ def test_push_line(self):
+ # A pushed-back line is returned by the following next_line() call.
+ s = StringIO.StringIO("foo\nbar\nbaz\n")
+ p = parser.LineBasedParser(s)
+ self.assertEqual('foo', p.next_line())
+ self.assertEqual('bar', p.next_line())
+ p.push_line('bar')
+ self.assertEqual('bar', p.next_line())
+ self.assertEqual('baz', p.next_line())
+ self.assertEqual(None, p.next_line())
+
+ def test_read_bytes(self):
+ # read_bytes counts raw bytes, including newlines.
+ s = StringIO.StringIO("foo\nbar\nbaz\n")
+ p = parser.LineBasedParser(s)
+ self.assertEqual('fo', p.read_bytes(2))
+ self.assertEqual('o\nb', p.read_bytes(3))
+ self.assertEqual('ar', p.next_line())
+ # Test that the line buffer is ignored
+ p.push_line('bar')
+ self.assertEqual('baz', p.read_bytes(3))
+ # Test missing bytes
+ self.assertRaises(errors.MissingBytes, p.read_bytes, 10)
+
+ def test_read_until(self):
+ # TODO: enable once read_until() is implemented
+ return
+ s = StringIO.StringIO("foo\nbar\nbaz\nabc\ndef\nghi\n")
+ p = parser.LineBasedParser(s)
+ self.assertEqual('foo\nbar', p.read_until('baz'))
+ self.assertEqual('abc', p.next_line())
+ # Test that the line buffer is ignored
+ p.push_line('abc')
+ self.assertEqual('def', p.read_until('ghi'))
+ # Test missing terminator. Pass the callable and its argument
+ # separately so assertRaises performs the call itself; the original
+ # invoked p.read_until('>>>') before assertRaises ever ran.
+ self.assertRaises(errors.MissingTerminator, p.read_until, '>>>')
+
+
+# Sample text
+_sample_import_text = """
+progress completed
+# Test blob formats
+blob
+mark :1
+data 4
+aaaablob
+data 5
+bbbbb
+# Commit formats
+commit refs/heads/master
+mark :2
+committer bugs bunny <bugs@bunny.org> now
+data 14
+initial import
+M 644 inline README
+data 18
+Welcome from bugs
+commit refs/heads/master
+committer <bugs@bunny.org> now
+data 13
+second commit
+from :2
+M 644 inline README
+data 23
+Welcome from bugs, etc.
+# Miscellaneous
+checkpoint
+progress completed
+# Test a commit without sub-commands (bug #351717)
+commit refs/heads/master
+mark :3
+author <bugs@bunny.org> now
+committer <bugs@bunny.org> now
+data 20
+first commit, empty
+# Test a commit with a heredoc-style (delimited_data) message (bug #400960)
+commit refs/heads/master
+mark :4
+author <bugs@bunny.org> now
+committer <bugs@bunny.org> now
+data <<EOF
+Commit with heredoc-style message
+EOF
+# Test a "submodule"/tree-reference
+commit refs/heads/master
+mark :5
+author <bugs@bunny.org> now
+committer <bugs@bunny.org> now
+data 15
+submodule test
+M 160000 rev-id tree-id
+# Test features
+feature whatever
+feature foo=bar
+# Test commit with properties
+commit refs/heads/master
+mark :6
+committer <bugs@bunny.org> now
+data 18
+test of properties
+property p1
+property p2 5 hohum
+property p3 16 alpha
+beta
+gamma
+property p4 8 whatever
+# Test a commit with multiple authors
+commit refs/heads/master
+mark :7
+author Fluffy <fluffy@bunny.org> now
+author Daffy <daffy@duck.org> now
+author Donald <donald@duck.org> now
+committer <bugs@bunny.org> now
+data 17
+multi-author test
+"""
+
+
+class TestImportParser(testtools.TestCase):
+
+ def test_iter_commands(self):
+ s = StringIO.StringIO(_sample_import_text)
+ p = parser.ImportParser(s)
+ result = []
+ for cmd in p.iter_commands():
+ result.append(cmd)
+ if cmd.name == 'commit':
+ for fc in cmd.file_iter():
+ result.append(fc)
+ self.assertEqual(len(result), 17)
+ cmd1 = result.pop(0)
+ self.assertEqual('progress', cmd1.name)
+ self.assertEqual('completed', cmd1.message)
+ cmd2 = result.pop(0)
+ self.assertEqual('blob', cmd2.name)
+ self.assertEqual('1', cmd2.mark)
+ self.assertEqual(':1', cmd2.id)
+ self.assertEqual('aaaa', cmd2.data)
+ self.assertEqual(4, cmd2.lineno)
+ cmd3 = result.pop(0)
+ self.assertEqual('blob', cmd3.name)
+ self.assertEqual('@7', cmd3.id)
+ self.assertEqual(None, cmd3.mark)
+ self.assertEqual('bbbbb', cmd3.data)
+ self.assertEqual(7, cmd3.lineno)
+ cmd4 = result.pop(0)
+ self.assertEqual('commit', cmd4.name)
+ self.assertEqual('2', cmd4.mark)
+ self.assertEqual(':2', cmd4.id)
+ self.assertEqual('initial import', cmd4.message)
+ self.assertEqual('bugs bunny', cmd4.committer[0])
+ self.assertEqual('bugs@bunny.org', cmd4.committer[1])
+ # FIXME: check timestamp and timezone as well
+ self.assertEqual(None, cmd4.author)
+ self.assertEqual(11, cmd4.lineno)
+ self.assertEqual('refs/heads/master', cmd4.ref)
+ self.assertEqual(None, cmd4.from_)
+ self.assertEqual([], cmd4.merges)
+ file_cmd1 = result.pop(0)
+ self.assertEqual('filemodify', file_cmd1.name)
+ self.assertEqual('README', file_cmd1.path)
+ self.assertEqual('file', file_cmd1.kind)
+ self.assertEqual(False, file_cmd1.is_executable)
+ self.assertEqual('Welcome from bugs\n', file_cmd1.data)
+ cmd5 = result.pop(0)
+ self.assertEqual('commit', cmd5.name)
+ self.assertEqual(None, cmd5.mark)
+ self.assertEqual('@19', cmd5.id)
+ self.assertEqual('second commit', cmd5.message)
+ self.assertEqual('', cmd5.committer[0])
+ self.assertEqual('bugs@bunny.org', cmd5.committer[1])
+ # FIXME: check timestamp and timezone as well
+ self.assertEqual(None, cmd5.author)
+ self.assertEqual(19, cmd5.lineno)
+ self.assertEqual('refs/heads/master', cmd5.ref)
+ self.assertEqual(':2', cmd5.from_)
+ self.assertEqual([], cmd5.merges)
+ file_cmd2 = result.pop(0)
+ self.assertEqual('filemodify', file_cmd2.name)
+ self.assertEqual('README', file_cmd2.path)
+ self.assertEqual('file', file_cmd2.kind)
+ self.assertEqual(False, file_cmd2.is_executable)
+ self.assertEqual('Welcome from bugs, etc.', file_cmd2.data)
+ cmd6 = result.pop(0)
+ self.assertEqual(cmd6.name, 'checkpoint')
+ cmd7 = result.pop(0)
+ self.assertEqual('progress', cmd7.name)
+ self.assertEqual('completed', cmd7.message)
+ cmd = result.pop(0)
+ self.assertEqual('commit', cmd.name)
+ self.assertEqual('3', cmd.mark)
+ self.assertEqual(None, cmd.from_)
+ cmd = result.pop(0)
+ self.assertEqual('commit', cmd.name)
+ self.assertEqual('4', cmd.mark)
+ self.assertEqual('Commit with heredoc-style message\n', cmd.message)
+ cmd = result.pop(0)
+ self.assertEqual('commit', cmd.name)
+ self.assertEqual('5', cmd.mark)
+ self.assertEqual('submodule test\n', cmd.message)
+ file_cmd1 = result.pop(0)
+ self.assertEqual('filemodify', file_cmd1.name)
+ self.assertEqual('tree-id', file_cmd1.path)
+ self.assertEqual('tree-reference', file_cmd1.kind)
+ self.assertEqual(False, file_cmd1.is_executable)
+ self.assertEqual("rev-id", file_cmd1.dataref)
+ cmd = result.pop(0)
+ self.assertEqual('feature', cmd.name)
+ self.assertEqual('whatever', cmd.feature_name)
+ self.assertEqual(None, cmd.value)
+ cmd = result.pop(0)
+ self.assertEqual('feature', cmd.name)
+ self.assertEqual('foo', cmd.feature_name)
+ self.assertEqual('bar', cmd.value)
+ cmd = result.pop(0)
+ self.assertEqual('commit', cmd.name)
+ self.assertEqual('6', cmd.mark)
+ self.assertEqual('test of properties', cmd.message)
+ self.assertEqual({
+ 'p1': None,
+ 'p2': u'hohum',
+ 'p3': u'alpha\nbeta\ngamma',
+ 'p4': u'whatever',
+ }, cmd.properties)
+ cmd = result.pop(0)
+ self.assertEqual('commit', cmd.name)
+ self.assertEqual('7', cmd.mark)
+ self.assertEqual('multi-author test', cmd.message)
+ self.assertEqual('', cmd.committer[0])
+ self.assertEqual('bugs@bunny.org', cmd.committer[1])
+ self.assertEqual('Fluffy', cmd.author[0])
+ self.assertEqual('fluffy@bunny.org', cmd.author[1])
+ self.assertEqual('Daffy', cmd.more_authors[0][0])
+ self.assertEqual('daffy@duck.org', cmd.more_authors[0][1])
+ self.assertEqual('Donald', cmd.more_authors[1][0])
+ self.assertEqual('donald@duck.org', cmd.more_authors[1][1])
+
+
+class TestStringParsing(testtools.TestCase):
+
+ def test_unquote(self):
+ # \" -> ", \\ -> \ and \t -> tab in a C-style quoted string.
+ s = r'hello \"sweet\" wo\\r\tld'
+ # assertEqual, not the deprecated assertEquals alias used
+ # elsewhere; keeps this file consistent.
+ self.assertEqual(r'hello "sweet" wo\r' + "\tld",
+ parser._unquote_c_string(s))
+
+
+class TestPathPairParsing(testtools.TestCase):
+ """Tests for splitting rename/copy source-destination path pairs."""
+
+ def test_path_pair_simple(self):
+ p = parser.ImportParser("")
+ self.assertEqual(['foo', 'bar'], p._path_pair("foo bar"))
+
+ def test_path_pair_spaces_in_first(self):
+ # A source path containing spaces arrives double-quoted; the quotes
+ # are stripped from the parsed result.
+ p = parser.ImportParser("")
+ self.assertEqual(['foo bar', 'baz'],
+ p._path_pair('"foo bar" baz'))