summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIan Clatworthy <ian.clatworthy@canonical.com>2009-03-15 21:22:09 +1000
committerIan Clatworthy <ian.clatworthy@canonical.com>2009-03-15 21:22:09 +1000
commitb661d0c646d3f3f35bc6f669d3f24239c5eeb11c (patch)
treeb92d78fbeeb347ebfa8fa0086355757e636d8275
parentd18526c6e5bcc0bc6c63684cceef36f5b76cf85e (diff)
downloadbzr-fastimport-b661d0c646d3f3f35bc6f669d3f24239c5eeb11c.tar.gz
make the file-id cache optional and branch-ref aware
-rw-r--r--bzr_commit_handler.py32
-rw-r--r--cache_manager.py62
-rw-r--r--processors/generic_processor.py27
3 files changed, 71 insertions, 50 deletions
diff --git a/bzr_commit_handler.py b/bzr_commit_handler.py
index be371e9..f33470c 100644
--- a/bzr_commit_handler.py
+++ b/bzr_commit_handler.py
@@ -45,6 +45,7 @@ class GenericCommitHandler(processor.CommitHandler):
self.cache_mgr = cache_mgr
self.rev_store = rev_store
self.verbose = verbose
+ self.branch_ref = command.ref
def pre_process_files(self):
"""Prepare for committing."""
@@ -147,12 +148,17 @@ class GenericCommitHandler(processor.CommitHandler):
is_new = True if the file_id is newly created
"""
try:
- id = self.cache_mgr.file_ids[path]
+ id = self.cache_mgr.fetch_file_id(self.branch_ref, path)
return id, False
except KeyError:
- id = generate_ids.gen_file_id(path)
- self.cache_mgr.file_ids[path] = id
- self.debug("Generated new file id %s for '%s'", id, path)
+ # Not in the cache, try the inventory
+ id = self.basis_inventory.path2id(path)
+ if id is None:
+ # Doesn't exist yet so create it
+ id = generate_ids.gen_file_id(path)
+ self.debug("Generated new file id %s for '%s' in '%s'",
+ id, path, self.branch_ref)
+ self.cache_mgr.store_file_id(self.branch_ref, path, id)
return id, True
def bzr_file_id(self, path):
@@ -316,7 +322,7 @@ class GenericCommitHandler(processor.CommitHandler):
self.record_delete(new_path, inv[new_file_id])
ie.revision = self.revision_id
self.record_rename(old_path, new_path, file_id, ie)
- self.cache_mgr.rename_path(old_path, new_path)
+ self.cache_mgr.rename_path(self.branch_ref, old_path, new_path)
# The revision-id for this entry will be/has been updated and
# that means the loader then needs to know what the "new" text is.
@@ -343,7 +349,11 @@ class InventoryCommitHandler(GenericCommitHandler):
def pre_process_files(self):
super(InventoryCommitHandler, self).pre_process_files()
- # Seed the inventory from the previous one
+ # Seed the inventory from the previous one. Note that
+ # the parent class version of pre_process_files() has
+ # already set the right basis_inventory for this branch
+ # but we need to copy it in order to mutate it safely
+ # without corrupting the cached inventory value.
if len(self.parents) == 0:
self.inventory = self.basis_inventory
else:
@@ -415,8 +425,12 @@ class InventoryCommitHandler(GenericCommitHandler):
del inv[fileid]
else:
# already added by some other name?
- if dirname in self.cache_mgr.file_ids:
- parent_id = self.cache_mgr.file_ids[dirname]
+ try:
+ parent_id = self.cache_mgr.fetch_file_id(self.branch_ref,
+ dirname)
+ except KeyError:
+ pass
+ else:
del inv[parent_id].children[basename]
except KeyError:
self._warn_unless_in_merges(fileid, path)
@@ -430,7 +444,7 @@ class InventoryCommitHandler(GenericCommitHandler):
else:
raise
try:
- self.cache_mgr.delete_path(path)
+ self.cache_mgr.delete_path(self.branch_ref, path)
except KeyError:
pass
diff --git a/cache_manager.py b/cache_manager.py
index 527b288..cf28bee 100644
--- a/cache_manager.py
+++ b/cache_manager.py
@@ -44,8 +44,9 @@ class CacheManager(object):
# we need to keep all of these but they are small
self.revision_ids = {}
- # path -> file-ids - as generated
- self.file_ids = {}
+ # (path, branch_ref) -> file-ids - as generated.
+ # (Use store_file_id/fetch_file_id methods rather than direct access.)
+ self._file_ids = {}
# Head tracking: last ref, last id per ref & map of commit ids to ref*s*
self.last_ref = None
@@ -67,22 +68,26 @@ class CacheManager(object):
note("Cache statistics:")
self._show_stats_for(self._sticky_blobs, "sticky blobs", note=note)
self._show_stats_for(self.revision_ids, "revision-ids", note=note)
- self._show_stats_for(self.file_ids, "file-ids", note=note)
+ self._show_stats_for(self._file_ids, "file-ids", note=note,
+ tuple_key=True)
# These aren't interesting so omit from the output, at least for now
#self._show_stats_for(self._blobs, "other blobs", note=note)
#self._show_stats_for(self.last_ids, "last-ids", note=note)
#self._show_stats_for(self.heads, "heads", note=note)
- def _show_stats_for(self, dict, label, note=trace.note):
+ def _show_stats_for(self, dict, label, note=trace.note, tuple_key=False):
"""Dump statistics about a given dictionary.
Both the key and value need to support len().
"""
count = len(dict)
- size = sum(map(len, dict.keys()))
+ if tuple_key:
+ size = sum(map(len, (''.join(k) for k in dict.keys())))
+ else:
+ size = sum(map(len, dict.keys()))
size += sum(map(len, dict.values()))
kbytes = size * 1.0 / 1024
- note(" %-12s: %8.1fs kB (%d %s)" % (label, kbytes, count,
+ note(" %-12s: %8.1f kB (%d %s)" % (label, kbytes, count,
helpers.single_plural(count, "item", "items")))
def clear_all(self):
@@ -90,7 +95,7 @@ class CacheManager(object):
self._blobs.clear()
self._sticky_blobs.clear()
self.revision_ids.clear()
- self.file_ids.clear()
+ self._file_ids.clear()
self.last_ids.clear()
self.heads.clear()
self.inventories.clear()
@@ -110,18 +115,47 @@ class CacheManager(object):
except KeyError:
return self._blobs.pop(id)
- def delete_path(self, path):
+ def store_file_id(self, branch_ref, path, id):
+ """Store the path to file-id mapping for a branch."""
+ key = self._fileid_key(path, branch_ref)
+ self._file_ids[key] = id
+
+ def fetch_file_id(self, branch_ref, path):
+ """Lookup the file-id for a path in a branch.
+
+ Raises KeyError if unsuccessful.
+ """
+ key = self._fileid_key(path, branch_ref)
+ return self._file_ids[key]
+
+ def _fileid_key(self, path, branch_ref):
+ return (path, branch_ref)
+
+ def delete_path(self, branch_ref, path):
"""Remove a path from caches."""
- # we actually want to remember what file-id we gave a path,
- # even when that file is deleted, so doing nothing is correct
+ # We actually want to remember what file-id we gave a path,
+ # even when that file is deleted, so doing nothing is correct.
+ # It's quite possible for a path to be deleted twice where
+ # the first time is in a merge branch (but the same branch_ref)
+ # and the second time is when that branch is merged to mainline.
pass
- def rename_path(self, old_path, new_path):
+ def rename_path(self, branch_ref, old_path, new_path):
"""Rename a path in the caches."""
# In this case, we need to forget the file-id we gave a path,
- # otherwise, we'll get duplicate file-ids in the repository.
- self.file_ids[new_path] = self.file_ids[old_path]
- del self.file_ids[old_path]
+ # otherwise, we'll get duplicate file-ids in the repository
+ # if a new file is created at the old path.
+ old_key = self._fileid_key(old_path, branch_ref)
+ new_key = self._fileid_key(new_path, branch_ref)
+ try:
+ old_file_id = self._file_ids[old_key]
+ except KeyError:
+ # The old_key has already been removed, most likely
+ # in a merge branch.
+ pass
+ else:
+ self._file_ids[new_key] = old_file_id
+ del self._file_ids[old_key]
def track_heads(self, cmd):
"""Track the repository heads given a CommitCommand.
diff --git a/processors/generic_processor.py b/processors/generic_processor.py
index fe83d5a..904b911 100644
--- a/processors/generic_processor.py
+++ b/processors/generic_processor.py
@@ -414,15 +414,10 @@ class GenericProcessor(processor.ImportProcessor):
pass
self.cache_mgr._blobs = {}
self._revision_count += 1
- # If we're finished getting back to where we were,
- # load the file-ids cache
- if self._revision_count == self.skip_total:
- self._gen_file_ids_cache()
return
if self.first_incremental_commit:
self.first_incremental_commit = None
parents = self.cache_mgr.track_heads(cmd)
- self._gen_file_ids_cache(parents)
# 'Commit' the revision and report progress
handler = self.commit_handler_factory(cmd, self.cache_mgr,
@@ -442,28 +437,6 @@ class GenericProcessor(processor.ImportProcessor):
self._revision_count)
self.checkpoint_handler(None)
- def _gen_file_ids_cache(self, revs=False):
- """Generate the file-id cache by searching repository inventories.
- """
- # Get the interesting revisions - the heads
- if revs:
- head_ids = revs
- else:
- head_ids = self.cache_mgr.heads.keys()
- revision_ids = [self.cache_mgr.revision_ids[h] for h in head_ids]
-
- # Update the fileid cache
- file_ids = {}
- for revision_id in revision_ids:
- self.note("Collecting file-ids for head %s ..." % revision_id)
- inv = self.repo.revision_tree(revision_id).inventory
- # Cache the inventories while we're at it
- self.cache_mgr.inventories[revision_id] = inv
- for path, ie in inv.iter_entries():
- file_ids[path] = ie.file_id
- self.cache_mgr.file_ids = file_ids
- self.note("Generated the file-ids cache - %d entries" % len(file_ids))
-
def report_progress(self, details=''):
if self._revision_count % self.progress_every == 0:
if self.total_commits is not None: