summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIan Clatworthy <ian.clatworthy@canonical.com>2009-03-15 21:22:09 +1000
committerIan Clatworthy <ian.clatworthy@canonical.com>2009-03-15 21:22:09 +1000
commitb661d0c646d3f3f35bc6f669d3f24239c5eeb11c (patch)
treeb92d78fbeeb347ebfa8fa0086355757e636d8275
parentd18526c6e5bcc0bc6c63684cceef36f5b76cf85e (diff)
downloadbzr-fastimport-b661d0c646d3f3f35bc6f669d3f24239c5eeb11c.tar.gz
make the file-id cache optional and branch-ref aware
-rw-r--r--bzr_commit_handler.py32
-rw-r--r--cache_manager.py62
-rw-r--r--processors/generic_processor.py27
3 files changed, 71 insertions, 50 deletions
diff --git a/bzr_commit_handler.py b/bzr_commit_handler.py
index be371e9..f33470c 100644
--- a/bzr_commit_handler.py
+++ b/bzr_commit_handler.py
@@ -45,6 +45,7 @@ class GenericCommitHandler(processor.CommitHandler):
self.cache_mgr = cache_mgr
self.rev_store = rev_store
self.verbose = verbose
+ self.branch_ref = command.ref
def pre_process_files(self):
"""Prepare for committing."""
@@ -147,12 +148,17 @@ class GenericCommitHandler(processor.CommitHandler):
is_new = True if the file_id is newly created
"""
try:
- id = self.cache_mgr.file_ids[path]
+ id = self.cache_mgr.fetch_file_id(self.branch_ref, path)
return id, False
except KeyError:
- id = generate_ids.gen_file_id(path)
- self.cache_mgr.file_ids[path] = id
- self.debug("Generated new file id %s for '%s'", id, path)
+ # Not in the cache, try the inventory
+ id = self.basis_inventory.path2id(path)
+ if id is None:
+ # Doesn't exist yet so create it
+ id = generate_ids.gen_file_id(path)
+ self.debug("Generated new file id %s for '%s' in '%s'",
+ id, path, self.branch_ref)
+ self.cache_mgr.store_file_id(self.branch_ref, path, id)
return id, True
def bzr_file_id(self, path):
@@ -316,7 +322,7 @@ class GenericCommitHandler(processor.CommitHandler):
self.record_delete(new_path, inv[new_file_id])
ie.revision = self.revision_id
self.record_rename(old_path, new_path, file_id, ie)
- self.cache_mgr.rename_path(old_path, new_path)
+ self.cache_mgr.rename_path(self.branch_ref, old_path, new_path)
# The revision-id for this entry will be/has been updated and
# that means the loader then needs to know what the "new" text is.
@@ -343,7 +349,11 @@ class InventoryCommitHandler(GenericCommitHandler):
def pre_process_files(self):
super(InventoryCommitHandler, self).pre_process_files()
- # Seed the inventory from the previous one
+ # Seed the inventory from the previous one. Note that
+ # the parent class version of pre_process_files() has
+ # already set the right basis_inventory for this branch
+ # but we need to copy it in order to mutate it safely
+ # without corrupting the cached inventory value.
if len(self.parents) == 0:
self.inventory = self.basis_inventory
else:
@@ -415,8 +425,12 @@ class InventoryCommitHandler(GenericCommitHandler):
del inv[fileid]
else:
# already added by some other name?
- if dirname in self.cache_mgr.file_ids:
- parent_id = self.cache_mgr.file_ids[dirname]
+ try:
+ parent_id = self.cache_mgr.fetch_file_id(self.branch_ref,
+ dirname)
+ except KeyError:
+ pass
+ else:
del inv[parent_id].children[basename]
except KeyError:
self._warn_unless_in_merges(fileid, path)
@@ -430,7 +444,7 @@ class InventoryCommitHandler(GenericCommitHandler):
else:
raise
try:
- self.cache_mgr.delete_path(path)
+ self.cache_mgr.delete_path(self.branch_ref, path)
except KeyError:
pass
diff --git a/cache_manager.py b/cache_manager.py
index 527b288..cf28bee 100644
--- a/cache_manager.py
+++ b/cache_manager.py
@@ -44,8 +44,9 @@ class CacheManager(object):
# we need to keep all of these but they are small
self.revision_ids = {}
- # path -> file-ids - as generated
- self.file_ids = {}
+ # (path, branch_ref) -> file-ids - as generated.
+ # (Use store_file_id/fetch_file_id methods rather than direct access.)
+ self._file_ids = {}
# Head tracking: last ref, last id per ref & map of commit ids to ref*s*
self.last_ref = None
@@ -67,22 +68,26 @@ class CacheManager(object):
note("Cache statistics:")
self._show_stats_for(self._sticky_blobs, "sticky blobs", note=note)
self._show_stats_for(self.revision_ids, "revision-ids", note=note)
- self._show_stats_for(self.file_ids, "file-ids", note=note)
+ self._show_stats_for(self._file_ids, "file-ids", note=note,
+ tuple_key=True)
# These aren't interesting so omit from the output, at least for now
#self._show_stats_for(self._blobs, "other blobs", note=note)
#self._show_stats_for(self.last_ids, "last-ids", note=note)
#self._show_stats_for(self.heads, "heads", note=note)
- def _show_stats_for(self, dict, label, note=trace.note):
+ def _show_stats_for(self, dict, label, note=trace.note, tuple_key=False):
"""Dump statistics about a given dictionary.
Both the key and value need to support len().
"""
count = len(dict)
- size = sum(map(len, dict.keys()))
+ if tuple_key:
+ size = sum(map(len, (''.join(k) for k in dict.keys())))
+ else:
+ size = sum(map(len, dict.keys()))
size += sum(map(len, dict.values()))
kbytes = size * 1.0 / 1024
- note(" %-12s: %8.1fs kB (%d %s)" % (label, kbytes, count,
+ note(" %-12s: %8.1f kB (%d %s)" % (label, kbytes, count,
helpers.single_plural(count, "item", "items")))
def clear_all(self):
@@ -90,7 +95,7 @@ class CacheManager(object):
self._blobs.clear()
self._sticky_blobs.clear()
self.revision_ids.clear()
- self.file_ids.clear()
+ self._file_ids.clear()
self.last_ids.clear()
self.heads.clear()
self.inventories.clear()
@@ -110,18 +115,47 @@ class CacheManager(object):
except KeyError:
return self._blobs.pop(id)
- def delete_path(self, path):
+ def store_file_id(self, branch_ref, path, id):
+ """Store the path to file-id mapping for a branch."""
+ key = self._fileid_key(path, branch_ref)
+ self._file_ids[key] = id
+
+ def fetch_file_id(self, branch_ref, path):
+ """Lookup the file-id for a path in a branch.
+
+ Raises KeyError if unsuccessful.
+ """
+ key = self._fileid_key(path, branch_ref)
+ return self._file_ids[key]
+
+ def _fileid_key(self, path, branch_ref):
+ return (path, branch_ref)
+
+ def delete_path(self, branch_ref, path):
"""Remove a path from caches."""
- # we actually want to remember what file-id we gave a path,
- # even when that file is deleted, so doing nothing is correct
+ # We actually want to remember what file-id we gave a path,
+ # even when that file is deleted, so doing nothing is correct.
+ # It's quite possible for a path to be deleted twice where
+ # the first time is in a merge branch (but the same branch_ref)
+ # and the second time is when that branch is merged to mainline.
pass
- def rename_path(self, old_path, new_path):
+ def rename_path(self, branch_ref, old_path, new_path):
"""Rename a path in the caches."""
# In this case, we need to forget the file-id we gave a path,
- # otherwise, we'll get duplicate file-ids in the repository.
- self.file_ids[new_path] = self.file_ids[old_path]
- del self.file_ids[old_path]
+ # otherwise, we'll get duplicate file-ids in the repository
+ # if a new file is created at the old path.
+ old_key = self._fileid_key(old_path, branch_ref)
+ new_key = self._fileid_key(new_path, branch_ref)
+ try:
+ old_file_id = self._file_ids[old_key]
+ except KeyError:
+ # The old_key has already been removed, most likely
+ # in a merge branch.
+ pass
+ else:
+ self._file_ids[new_key] = old_file_id
+ del self._file_ids[old_key]
def track_heads(self, cmd):
"""Track the repository heads given a CommitCommand.
diff --git a/processors/generic_processor.py b/processors/generic_processor.py
index fe83d5a..904b911 100644
--- a/processors/generic_processor.py
+++ b/processors/generic_processor.py
@@ -414,15 +414,10 @@ class GenericProcessor(processor.ImportProcessor):
pass
self.cache_mgr._blobs = {}
self._revision_count += 1
- # If we're finished getting back to where we were,
- # load the file-ids cache
- if self._revision_count == self.skip_total:
- self._gen_file_ids_cache()
return
if self.first_incremental_commit:
self.first_incremental_commit = None
parents = self.cache_mgr.track_heads(cmd)
- self._gen_file_ids_cache(parents)
# 'Commit' the revision and report progress
handler = self.commit_handler_factory(cmd, self.cache_mgr,
@@ -442,28 +437,6 @@ class GenericProcessor(processor.ImportProcessor):
self._revision_count)
self.checkpoint_handler(None)
- def _gen_file_ids_cache(self, revs=False):
- """Generate the file-id cache by searching repository inventories.
- """
- # Get the interesting revisions - the heads
- if revs:
- head_ids = revs
- else:
- head_ids = self.cache_mgr.heads.keys()
- revision_ids = [self.cache_mgr.revision_ids[h] for h in head_ids]
-
- # Update the fileid cache
- file_ids = {}
- for revision_id in revision_ids:
- self.note("Collecting file-ids for head %s ..." % revision_id)
- inv = self.repo.revision_tree(revision_id).inventory
- # Cache the inventories while we're at it
- self.cache_mgr.inventories[revision_id] = inv
- for path, ie in inv.iter_entries():
- file_ids[path] = ie.file_id
- self.cache_mgr.file_ids = file_ids
- self.note("Generated the file-ids cache - %d entries" % len(file_ids))
-
def report_progress(self, details=''):
if self._revision_count % self.progress_every == 0:
if self.total_commits is not None: