diff options
author | Ian Clatworthy <ian.clatworthy@canonical.com> | 2009-03-09 23:37:17 +1000 |
---|---|---|
committer | Ian Clatworthy <ian.clatworthy@canonical.com> | 2009-03-09 23:37:17 +1000 |
commit | 5804f1f560c28bcaca8fd168e0b9406b297f1b91 (patch) | |
tree | bae407263ba83211250ebdb82e33b423ef5baa03 | |
parent | 672b8e9625a0389dfcee0f7372c32e1e8d8643c6 (diff) | |
download | bzr-fastimport-5804f1f560c28bcaca8fd168e0b9406b297f1b91.tar.gz |
clear caches before packing; show cache stats in verbose mode
-rw-r--r-- | cache_manager.py | 38 | ||||
-rw-r--r-- | processors/generic_processor.py | 10 |
2 files changed, 44 insertions, 4 deletions
diff --git a/cache_manager.py b/cache_manager.py index 7a00598..527b288 100644 --- a/cache_manager.py +++ b/cache_manager.py @@ -17,8 +17,8 @@ """A manager of caches.""" -from bzrlib import lru_cache - +from bzrlib import lru_cache, trace +from bzrlib.plugins.fastimport import helpers class CacheManager(object): @@ -61,6 +61,40 @@ class CacheManager(object): # info not in file - possible when no blobs used pass + def dump_stats(self, note=trace.note): + """Dump some statistics about what we cached.""" + # TODO: add in inventory statistics + note("Cache statistics:") + self._show_stats_for(self._sticky_blobs, "sticky blobs", note=note) + self._show_stats_for(self.revision_ids, "revision-ids", note=note) + self._show_stats_for(self.file_ids, "file-ids", note=note) + # These aren't interesting so omit from the output, at least for now + #self._show_stats_for(self._blobs, "other blobs", note=note) + #self._show_stats_for(self.last_ids, "last-ids", note=note) + #self._show_stats_for(self.heads, "heads", note=note) + + def _show_stats_for(self, dict, label, note=trace.note): + """Dump statistics about a given dictionary. + + Both the key and value need to support len(). 
+ """ + count = len(dict) + size = sum(map(len, dict.keys())) + size += sum(map(len, dict.values())) + kbytes = size * 1.0 / 1024 + note(" %-12s: %8.1fs kB (%d %s)" % (label, kbytes, count, + helpers.single_plural(count, "item", "items"))) + + def clear_all(self): + """Free up any memory used by the caches.""" + self._blobs.clear() + self._sticky_blobs.clear() + self.revision_ids.clear() + self.file_ids.clear() + self.last_ids.clear() + self.heads.clear() + self.inventories.clear() + def store_blob(self, id, data): """Store a blob of data.""" if (self._blobs_to_keep is None or data == '' or diff --git a/processors/generic_processor.py b/processors/generic_processor.py index fd39be1..ecca7dd 100644 --- a/processors/generic_processor.py +++ b/processors/generic_processor.py @@ -314,6 +314,10 @@ class GenericProcessor(processor.ImportProcessor): # checkpoint instead. We now pack the repository to optimise # how data is stored. if self._revision_count > self.checkpoint_every: + # Free whatever memory we can before packing + import gc + self.cache_mgr.clear_all() + gc.collect() self.note("Packing repository ...") self.repo.pack() # To be conservative, packing puts the old packs and @@ -357,6 +361,8 @@ class GenericProcessor(processor.ImportProcessor): bc, helpers.single_plural(bc, "branch", "branches"), wtc, helpers.single_plural(wtc, "tree", "trees"), time_required) + if self.verbose: + self.cache_mgr.dump_stats() def _init_id_map(self): """Load the id-map and check it matches the repository. 
@@ -410,8 +416,6 @@ class GenericProcessor(processor.ImportProcessor): # load the file-ids cache if self._revision_count == self.skip_total: self._gen_file_ids_cache() - self.note("Generated the file-ids cache - %d entries", - len(self.cache_mgr.file_ids.keys())) return if self.first_incremental_commit: self.first_incremental_commit = None @@ -449,12 +453,14 @@ class GenericProcessor(processor.ImportProcessor): # Update the fileid cache file_ids = {} for revision_id in revision_ids: + self.note("Collecting file-ids for head %s ..." % revision_id) inv = self.repo.revision_tree(revision_id).inventory # Cache the inventories while we're at it self.cache_mgr.inventories[revision_id] = inv for path, ie in inv.iter_entries(): file_ids[path] = ie.file_id self.cache_mgr.file_ids = file_ids + self.note("Generated the file-ids cache - %d entries" % len(file_ids)) def report_progress(self, details=''): if self._revision_count % self.progress_every == 0: |