diff options
author | Ian Clatworthy <ian.clatworthy@canonical.com> | 2009-03-15 21:22:09 +1000 |
---|---|---|
committer | Ian Clatworthy <ian.clatworthy@canonical.com> | 2009-03-15 21:22:09 +1000 |
commit | b661d0c646d3f3f35bc6f669d3f24239c5eeb11c (patch) | |
tree | b92d78fbeeb347ebfa8fa0086355757e636d8275 | |
parent | d18526c6e5bcc0bc6c63684cceef36f5b76cf85e (diff) | |
download | bzr-fastimport-b661d0c646d3f3f35bc6f669d3f24239c5eeb11c.tar.gz |
make the file-id cache optional and branch-ref aware
-rw-r--r-- | bzr_commit_handler.py | 32 | ||||
-rw-r--r-- | cache_manager.py | 62 | ||||
-rw-r--r-- | processors/generic_processor.py | 27 |
3 files changed, 71 insertions(+), 50 deletions(-)
diff --git a/bzr_commit_handler.py b/bzr_commit_handler.py index be371e9..f33470c 100644 --- a/bzr_commit_handler.py +++ b/bzr_commit_handler.py @@ -45,6 +45,7 @@ class GenericCommitHandler(processor.CommitHandler): self.cache_mgr = cache_mgr self.rev_store = rev_store self.verbose = verbose + self.branch_ref = command.ref def pre_process_files(self): """Prepare for committing.""" @@ -147,12 +148,17 @@ class GenericCommitHandler(processor.CommitHandler): is_new = True if the file_id is newly created """ try: - id = self.cache_mgr.file_ids[path] + id = self.cache_mgr.fetch_file_id(self.branch_ref, path) return id, False except KeyError: - id = generate_ids.gen_file_id(path) - self.cache_mgr.file_ids[path] = id - self.debug("Generated new file id %s for '%s'", id, path) + # Not in the cache, try the inventory + id = self.basis_inventory.path2id(path) + if id is None: + # Doesn't exist yet so create it + id = generate_ids.gen_file_id(path) + self.debug("Generated new file id %s for '%s' in '%s'", + id, path, self.branch_ref) + self.cache_mgr.store_file_id(self.branch_ref, path, id) return id, True def bzr_file_id(self, path): @@ -316,7 +322,7 @@ class GenericCommitHandler(processor.CommitHandler): self.record_delete(new_path, inv[new_file_id]) ie.revision = self.revision_id self.record_rename(old_path, new_path, file_id, ie) - self.cache_mgr.rename_path(old_path, new_path) + self.cache_mgr.rename_path(self.branch_ref, old_path, new_path) # The revision-id for this entry will be/has been updated and # that means the loader then needs to know what the "new" text is. @@ -343,7 +349,11 @@ class InventoryCommitHandler(GenericCommitHandler): def pre_process_files(self): super(InventoryCommitHandler, self).pre_process_files() - # Seed the inventory from the previous one + # Seed the inventory from the previous one. 
Note that + # the parent class version of pre_process_files() has + # already set the right basis_inventory for this branch + # but we need to copy it in order to mutate it safely + # without corrupting the cached inventory value. if len(self.parents) == 0: self.inventory = self.basis_inventory else: @@ -415,8 +425,12 @@ class InventoryCommitHandler(GenericCommitHandler): del inv[fileid] else: # already added by some other name? - if dirname in self.cache_mgr.file_ids: - parent_id = self.cache_mgr.file_ids[dirname] + try: + parent_id = self.cache_mgr.fetch_file_id(self.branch_ref, + dirname) + except KeyError: + pass + else: del inv[parent_id].children[basename] except KeyError: self._warn_unless_in_merges(fileid, path) @@ -430,7 +444,7 @@ class InventoryCommitHandler(GenericCommitHandler): else: raise try: - self.cache_mgr.delete_path(path) + self.cache_mgr.delete_path(self.branch_ref, path) except KeyError: pass diff --git a/cache_manager.py b/cache_manager.py index 527b288..cf28bee 100644 --- a/cache_manager.py +++ b/cache_manager.py @@ -44,8 +44,9 @@ class CacheManager(object): # we need to keep all of these but they are small self.revision_ids = {} - # path -> file-ids - as generated - self.file_ids = {} + # (path, branch_ref) -> file-ids - as generated. + # (Use store_file_id/fetch_fileid methods rather than direct access.) 
+ self._file_ids = {} # Head tracking: last ref, last id per ref & map of commit ids to ref*s* self.last_ref = None @@ -67,22 +68,26 @@ class CacheManager(object): note("Cache statistics:") self._show_stats_for(self._sticky_blobs, "sticky blobs", note=note) self._show_stats_for(self.revision_ids, "revision-ids", note=note) - self._show_stats_for(self.file_ids, "file-ids", note=note) + self._show_stats_for(self._file_ids, "file-ids", note=note, + tuple_key=True) # These aren't interesting so omit from the output, at least for now #self._show_stats_for(self._blobs, "other blobs", note=note) #self._show_stats_for(self.last_ids, "last-ids", note=note) #self._show_stats_for(self.heads, "heads", note=note) - def _show_stats_for(self, dict, label, note=trace.note): + def _show_stats_for(self, dict, label, note=trace.note, tuple_key=False): """Dump statistics about a given dictionary. By the key and value need to support len(). """ count = len(dict) - size = sum(map(len, dict.keys())) + if tuple_key: + size = sum(map(len, (''.join(k) for k in dict.keys()))) + else: + size = sum(map(len, dict.keys())) size += sum(map(len, dict.values())) kbytes = size * 1.0 / 1024 - note(" %-12s: %8.1fs kB (%d %s)" % (label, kbytes, count, + note(" %-12s: %8.1f kB (%d %s)" % (label, kbytes, count, helpers.single_plural(count, "item", "items"))) def clear_all(self): @@ -90,7 +95,7 @@ class CacheManager(object): self._blobs.clear() self._sticky_blobs.clear() self.revision_ids.clear() - self.file_ids.clear() + self._file_ids.clear() self.last_ids.clear() self.heads.clear() self.inventories.clear() @@ -110,18 +115,47 @@ class CacheManager(object): except KeyError: return self._blobs.pop(id) - def delete_path(self, path): + def store_file_id(self, branch_ref, path, id): + """Store the path to file-id mapping for a branch.""" + key = self._fileid_key(path, branch_ref) + self._file_ids[key] = id + + def fetch_file_id(self, branch_ref, path): + """Lookup the file-id for a path in a branch. 
+ + Raises KeyError if unsuccessful. + """ + key = self._fileid_key(path, branch_ref) + return self._file_ids[key] + + def _fileid_key(self, path, branch_ref): + return (path, branch_ref) + + def delete_path(self, branch_ref, path): """Remove a path from caches.""" - # we actually want to remember what file-id we gave a path, - # even when that file is deleted, so doing nothing is correct + # We actually want to remember what file-id we gave a path, + # even when that file is deleted, so doing nothing is correct. + # It's quite possible for a path to be deleted twice where + # the first time is in a merge branch (but the same branch_ref) + # and the second time is when that branch is merged to mainline. pass - def rename_path(self, old_path, new_path): + def rename_path(self, branch_ref, old_path, new_path): """Rename a path in the caches.""" # In this case, we need to forget the file-id we gave a path, - # otherwise, we'll get duplicate file-ids in the repository. - self.file_ids[new_path] = self.file_ids[old_path] - del self.file_ids[old_path] + # otherwise, we'll get duplicate file-ids in the repository + # if a new file is created at the old path. + old_key = self._fileid_key(old_path, branch_ref) + new_key = self._fileid_key(new_path, branch_ref) + try: + old_file_id = self._file_ids[old_key] + except KeyError: + # The old_key has already been removed, most likely + # in a merge branch. + pass + else: + self._file_ids[new_key] = old_file_id + del self._file_ids[old_key] def track_heads(self, cmd): """Track the repository heads given a CommitCommand. 
diff --git a/processors/generic_processor.py b/processors/generic_processor.py index fe83d5a..904b911 100644 --- a/processors/generic_processor.py +++ b/processors/generic_processor.py @@ -414,15 +414,10 @@ class GenericProcessor(processor.ImportProcessor): pass self.cache_mgr._blobs = {} self._revision_count += 1 - # If we're finished getting back to where we were, - # load the file-ids cache - if self._revision_count == self.skip_total: - self._gen_file_ids_cache() return if self.first_incremental_commit: self.first_incremental_commit = None parents = self.cache_mgr.track_heads(cmd) - self._gen_file_ids_cache(parents) # 'Commit' the revision and report progress handler = self.commit_handler_factory(cmd, self.cache_mgr, @@ -442,28 +437,6 @@ class GenericProcessor(processor.ImportProcessor): self._revision_count) self.checkpoint_handler(None) - def _gen_file_ids_cache(self, revs=False): - """Generate the file-id cache by searching repository inventories. - """ - # Get the interesting revisions - the heads - if revs: - head_ids = revs - else: - head_ids = self.cache_mgr.heads.keys() - revision_ids = [self.cache_mgr.revision_ids[h] for h in head_ids] - - # Update the fileid cache - file_ids = {} - for revision_id in revision_ids: - self.note("Collecting file-ids for head %s ..." % revision_id) - inv = self.repo.revision_tree(revision_id).inventory - # Cache the inventories while we're at it - self.cache_mgr.inventories[revision_id] = inv - for path, ie in inv.iter_entries(): - file_ids[path] = ie.file_id - self.cache_mgr.file_ids = file_ids - self.note("Generated the file-ids cache - %d entries" % len(file_ids)) - def report_progress(self, details=''): if self._revision_count % self.progress_every == 0: if self.total_commits is not None: |