diff options
author | Ian Clatworthy <ian.clatworthy@internode.on.net> | 2009-04-07 16:24:41 +1000 |
---|---|---|
committer | Ian Clatworthy <ian.clatworthy@internode.on.net> | 2009-04-07 16:24:41 +1000 |
commit | b5651405af553e4d8ebf94c8361c50fa4d56ac8b (patch) | |
tree | 7285f737445d84a3feead8417912de8b6b7a6e65 | |
parent | 31a84bddfa624f4bb5687fd3acc04ca89f74df6c (diff) | |
download | bzr-fastimport-b5651405af553e4d8ebf94c8361c50fa4d56ac8b.tar.gz |
faster export of revision range & improved diagnostics in fast-export
-rw-r--r-- | __init__.py | 2 | ||||
-rwxr-xr-x | bzr_exporter.py | 185 | ||||
-rw-r--r-- | processors/generic_processor.py | 2 |
3 files changed, 118 insertions, 71 deletions
diff --git a/__init__.py b/__init__.py index 3ecbeb1..0620316 100644 --- a/__init__.py +++ b/__init__.py @@ -417,7 +417,7 @@ class cmd_fast_export(Command): exporter = bzr_exporter.BzrFastExporter(source, git_branch=git_branch, checkpoint=checkpoint, import_marks_file=import_marks, export_marks_file=export_marks, - revision=revision) + revision=revision, verbose=verbose) return exporter.run() diff --git a/bzr_exporter.py b/bzr_exporter.py index 399dea7..3ffe56a 100755 --- a/bzr_exporter.py +++ b/bzr_exporter.py @@ -32,56 +32,52 @@ import sys, time import bzrlib.branch import bzrlib.revision -from bzrlib import errors as bazErrors -from bzrlib.builtins import _get_revision_range -from bzrlib.trace import note, warning +from bzrlib import ( + builtins, + errors as bazErrors, + progress, + trace, + ) from bzrlib.plugins.fastimport import commands, helpers, marks_file -try: - from bzrlib.log import _linear_view_revisions -except ImportError: - # This is taken from log.py. As this function only landed in bzr 1.12, it's - # copied here so fast-export can work on earlier versions. - def _linear_view_revisions(branch, start_rev_id, end_rev_id): - """Calculate a sequence of revisions to view, newest to oldest. - - :param start_rev_id: the lower revision-id - :param end_rev_id: the upper revision-id - :return: An iterator of (revision_id, dotted_revno, merge_depth) tuples. 
- :raises _StartNotLinearAncestor: if a start_rev_id is specified but - is not found walking the left-hand history - """ - br_revno, br_rev_id = branch.last_revision_info() - repo = branch.repository - if start_rev_id is None and end_rev_id is None: - cur_revno = br_revno - for revision_id in repo.iter_reverse_revision_history(br_rev_id): - yield revision_id, str(cur_revno), 0 - cur_revno -= 1 - else: - if end_rev_id is None: - end_rev_id = br_rev_id - found_start = start_rev_id is None - for revision_id in repo.iter_reverse_revision_history(end_rev_id): - revno = branch.revision_id_to_dotted_revno(revision_id) - revno_str = '.'.join(str(n) for n in revno) - if not found_start and revision_id == start_rev_id: - yield revision_id, revno_str, 0 - found_start = True - break - else: - yield revision_id, revno_str, 0 +# This is adapted from _linear_view_revisions in log.py in bzr 1.12. +def _iter_linear_revisions(branch, start_rev_id, end_rev_id): + """Calculate a sequence of revisions, newest to oldest. 
+ + :param start_rev_id: the lower revision-id + :param end_rev_id: the upper revision-id + :return: An iterator of revision_ids + :raises ValueError: if a start_rev_id is specified but + is not found walking the left-hand history + """ + br_revno, br_rev_id = branch.last_revision_info() + repo = branch.repository + if start_rev_id is None and end_rev_id is None: + for revision_id in repo.iter_reverse_revision_history(br_rev_id): + yield revision_id + else: + if end_rev_id is None: + end_rev_id = br_rev_id + found_start = start_rev_id is None + for revision_id in repo.iter_reverse_revision_history(end_rev_id): + if not found_start and revision_id == start_rev_id: + yield revision_id + found_start = True + break else: - if not found_start: - raise _StartNotLinearAncestor() + yield revision_id + else: + if not found_start: + raise ValueError() class BzrFastExporter(object): def __init__(self, source, git_branch=None, checkpoint=-1, - import_marks_file=None, export_marks_file=None, revision=None): + import_marks_file=None, export_marks_file=None, revision=None, + verbose=False): self.source = source self.outf = helpers.binary_stream(sys.stdout) self.git_branch = git_branch @@ -90,7 +86,18 @@ class BzrFastExporter(object): self.export_marks_file = export_marks_file self.revision = revision self.excluded_revisions = set() + self._multi_author_api_available = hasattr(bzrlib.revision.Revision, + 'get_apparent_authors') + # Progress reporting stuff + self.verbose = verbose + if verbose: + self.progress_every = 100 + else: + self.progress_every = 1000 + self._start_time = time.time() + + # Load the marks and initialise things accordingly self.revid_to_mark = {} self.branch_names = {} if self.import_marks_file: @@ -102,24 +109,26 @@ class BzrFastExporter(object): def interesting_history(self): if self.revision: - rev1, rev2 = _get_revision_range(self.revision, self.branch, - "fast-export") + rev1, rev2 = builtins._get_revision_range(self.revision, + self.branch, 
"fast-export") start_rev_id = rev1.rev_id end_rev_id = rev2.rev_id else: start_rev_id = None end_rev_id = None - view_revisions = reversed(list(_linear_view_revisions(self.branch, + self.note("Calculating the revisions to include ...") + view_revisions = reversed(list(_iter_linear_revisions(self.branch, start_rev_id, end_rev_id))) # If a starting point was given, we need to later check that we don't # start emitting revisions from before that point. Collect the # revisions to exclude now ... if start_rev_id is not None: # The result is inclusive so skip the first (the oldest) one - uninteresting = [revid for revid, _, _ in _linear_view_revisions( - self.branch, None, start_rev_id)][1:] + self.note("Calculating the revisions to exclude ...") + uninteresting = list(_iter_linear_revisions(self.branch, None, + start_rev_id))[1:] self.excluded_revisions = set(uninteresting) - return [revid for revid, _, _ in view_revisions] + return list(view_revisions) def run(self): # Open the source @@ -137,12 +146,42 @@ class BzrFastExporter(object): # Save the marks if requested self._save_marks() + self.dump_stats() + + def note(self, msg, *args): + """Output a note but timestamp it.""" + msg = "%s %s" % (self._time_of_day(), msg) + trace.note(msg, *args) + + def warning(self, msg, *args): + """Output a warning but timestamp it.""" + msg = "%s WARNING: %s" % (self._time_of_day(), msg) + trace.warning(msg, *args) + + def _time_of_day(self): + """Time of day as a string.""" + # Note: this is a separate method so tests can patch in a fixed value + return time.strftime("%H:%M:%S") + + def report_progress(self, commit_count, details=''): + # Note: we can't easily give a total count here because we + # don't know how many merged revisions will need to be output + if commit_count and commit_count % self.progress_every == 0: + counts = "%d" % (commit_count,) + minutes = (time.time() - self._start_time) / 60 + rate = commit_count * 1.0 / minutes + if rate > 10: + rate_str = "at %.0f/minute 
" % rate + else: + rate_str = "at %.1f/minute " % rate + self.note("%s commits exported %s%s" % (counts, rate_str, details)) - def note(self, message): - note("bzr fast-export: %s" % message) - - def warning(self, message): - warning("bzr fast-export: %s" % message) + def dump_stats(self): + time_required = progress.str_tdelta(time.time() - self._start_time) + rc = len(self.revid_to_mark) + self.note("Exported %d %s in %s", + rc, helpers.single_plural(rc, "revision", "revisions"), + time_required) def print_cmd(self, cmd): self.outf.write("%r\n" % cmd) @@ -156,7 +195,8 @@ class BzrFastExporter(object): def is_empty_dir(self, tree, path): path_id = tree.path2id(path) if path_id == None: - warning("Skipping empty_dir detection, could not find path_id...") + self.warning("Skipping empty_dir detection - no file_id for %s" % + (path,)) return False # Continue if path is not a directory @@ -182,22 +222,14 @@ class BzrFastExporter(object): self.revid_to_mark[revid] = -1 return - # Checkpoint if it's time for that - ncommits = len(self.revid_to_mark) - if (self.checkpoint > 0 and ncommits - and ncommits % self.checkpoint == 0): - self.note("Exported %i commits - checkpointing" % ncommits) - self._save_marks() - self.print_cmd(commands.CheckpointCommand()) - # Emit parents nparents = len(revobj.parent_ids) if nparents: for parent in revobj.parent_ids: self.emit_commit(parent, git_branch) - ncommits = len(self.revid_to_mark) # Get the primary parent + ncommits = len(self.revid_to_mark) if nparents == 0: if ncommits: # This is a parentless commit but it's not the first one @@ -216,12 +248,24 @@ class BzrFastExporter(object): self.print_cmd(self._get_commit_command(git_ref, mark, revobj, file_cmds)) + # Report progress and checkpoint if it's time for that + self.report_progress(ncommits) + if (self.checkpoint > 0 and ncommits + and ncommits % self.checkpoint == 0): + self.note("Exported %i commits - adding checkpoint to output" + % ncommits) + self._save_marks() + 
self.print_cmd(commands.CheckpointCommand()) + def _get_commit_command(self, git_ref, mark, revobj, file_cmds): # Get the committer and author info committer = revobj.committer name, email = parseaddr(committer) committer_info = (name, email, revobj.timestamp, revobj.timezone) - author = revobj.get_apparent_author() + if self._multi_author_api_available: + author = revobj.get_apparent_authors()[0] + else: + author = revobj.get_apparent_author() if author != committer: name, email = parseaddr(author) author_info = (name, email, revobj.timestamp, revobj.timezone) @@ -284,7 +328,8 @@ class BzrFastExporter(object): self.note("Skipping empty dir %s in rev %s" % (oldpath, revision_id)) continue - oldpath = self._adjust_path_for_renames(oldpath, renamed) + oldpath = self._adjust_path_for_renames(oldpath, renamed, + revision_id) renamed.append([oldpath, newpath]) file_cmds.append(commands.FileRenameCommand(oldpath, newpath)) if text_modified or meta_modified: @@ -292,12 +337,12 @@ class BzrFastExporter(object): # Record deletes for path, id_, kind in changes.removed: - path = self._adjust_path_for_renames(path, renamed) + path = self._adjust_path_for_renames(path, renamed, revision_id) file_cmds.append(commands.FileDeleteCommand(path)) # Map kind changes to a delete followed by an add for path, id_, kind1, kind2 in changes.kind_changed: - path = self._adjust_path_for_renames(path, renamed) + path = self._adjust_path_for_renames(path, renamed, revision_id) # IGC: I don't understand why a delete is needed here. # In fact, it seems harmful? If you uncomment this line, # please file a bug explaining why you needed to. 
@@ -319,15 +364,17 @@ class BzrFastExporter(object): continue return file_cmds - def _adjust_path_for_renames(self, path, renamed): + def _adjust_path_for_renames(self, path, renamed, revision_id): # If a previous rename is found, we should adjust the path for old, new in renamed: if path == old: - self.note("Changing path %s given rename to %s" % (path, new)) + self.note("Changing path %s given rename to %s in revision %s" + % (path, new, revision_id)) path = new elif path.startswith(old + '/'): - self.note("Adjusting path %s given rename of %s to %s" % - (path, old, new)) + self.note( + "Adjusting path %s given rename of %s to %s in revision %s" + % (path, old, new, revision_id)) path = path.replace(old + "/", new + "/") return path diff --git a/processors/generic_processor.py b/processors/generic_processor.py index e8bb1a0..8cac856 100644 --- a/processors/generic_processor.py +++ b/processors/generic_processor.py @@ -322,7 +322,7 @@ class GenericProcessor(processor.ImportProcessor): else: self.warning("No working trees available to update") - # Dum pthe cache stats now because we clear it before the final pack + # Dump the cache stats now because we clear it before the final pack if self.verbose: self.cache_mgr.dump_stats() if self._original_max_pack_count: |