summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIan Clatworthy <ian.clatworthy@canonical.com>2009-03-09 23:37:17 +1000
committerIan Clatworthy <ian.clatworthy@canonical.com>2009-03-09 23:37:17 +1000
commit5804f1f560c28bcaca8fd168e0b9406b297f1b91 (patch)
treebae407263ba83211250ebdb82e33b423ef5baa03
parent672b8e9625a0389dfcee0f7372c32e1e8d8643c6 (diff)
downloadbzr-fastimport-5804f1f560c28bcaca8fd168e0b9406b297f1b91.tar.gz
clear caches before packing; show cache stats in verbose mode
-rw-r--r--cache_manager.py38
-rw-r--r--processors/generic_processor.py10
2 files changed, 44 insertions, 4 deletions
diff --git a/cache_manager.py b/cache_manager.py
index 7a00598..527b288 100644
--- a/cache_manager.py
+++ b/cache_manager.py
@@ -17,8 +17,8 @@
"""A manager of caches."""
-from bzrlib import lru_cache
-
+from bzrlib import lru_cache, trace
+from bzrlib.plugins.fastimport import helpers
class CacheManager(object):
@@ -61,6 +61,40 @@ class CacheManager(object):
# info not in file - possible when no blobs used
pass
+ def dump_stats(self, note=trace.note):
+ """Report the size of the main caches, one line each, through ``note``."""
+ # TODO: add in inventory statistics
+ note("Cache statistics:")
+ self._show_stats_for(self._sticky_blobs, "sticky blobs", note=note)
+ self._show_stats_for(self.revision_ids, "revision-ids", note=note)
+ self._show_stats_for(self.file_ids, "file-ids", note=note)
+ # The remaining caches aren't interesting, so omit them for now
+ #self._show_stats_for(self._blobs, "other blobs", note=note)
+ #self._show_stats_for(self.last_ids, "last-ids", note=note)
+ #self._show_stats_for(self.heads, "heads", note=note)
+
+ def _show_stats_for(self, dict, label, note=trace.note):
+ """Report one cache's item count and the summed len() of its keys and values in kB.
+
+ Both the keys and the values need to support len().
+ """
+ count = len(dict)
+ size = sum(map(len, dict.keys()))
+ size += sum(map(len, dict.values()))
+ kbytes = size * 1.0 / 1024
+ note(" %-12s: %8.1f kB (%d %s)" % (label, kbytes, count,
+ helpers.single_plural(count, "item", "items")))
+
+ def clear_all(self):
+ """Clear each cache below so the memory it holds can be released."""
+ self._blobs.clear()
+ self._sticky_blobs.clear()
+ self.revision_ids.clear()
+ self.file_ids.clear()
+ self.last_ids.clear()
+ self.heads.clear()
+ self.inventories.clear()
+
def store_blob(self, id, data):
"""Store a blob of data."""
if (self._blobs_to_keep is None or data == '' or
diff --git a/processors/generic_processor.py b/processors/generic_processor.py
index fd39be1..ecca7dd 100644
--- a/processors/generic_processor.py
+++ b/processors/generic_processor.py
@@ -314,6 +314,10 @@ class GenericProcessor(processor.ImportProcessor):
# checkpoint instead. We now pack the repository to optimise
# how data is stored.
if self._revision_count > self.checkpoint_every:
+ # Free whatever memory we can before packing
+ import gc
+ self.cache_mgr.clear_all()
+ gc.collect()
self.note("Packing repository ...")
self.repo.pack()
# To be conservative, packing puts the old packs and
@@ -357,6 +361,8 @@ class GenericProcessor(processor.ImportProcessor):
bc, helpers.single_plural(bc, "branch", "branches"),
wtc, helpers.single_plural(wtc, "tree", "trees"),
time_required)
+ if self.verbose:
+ self.cache_mgr.dump_stats()
def _init_id_map(self):
"""Load the id-map and check it matches the repository.
@@ -410,8 +416,6 @@ class GenericProcessor(processor.ImportProcessor):
# load the file-ids cache
if self._revision_count == self.skip_total:
self._gen_file_ids_cache()
- self.note("Generated the file-ids cache - %d entries",
- len(self.cache_mgr.file_ids.keys()))
return
if self.first_incremental_commit:
self.first_incremental_commit = None
@@ -449,12 +453,14 @@ class GenericProcessor(processor.ImportProcessor):
# Update the fileid cache
file_ids = {}
for revision_id in revision_ids:
+ self.note("Collecting file-ids for head %s ..." % revision_id)
inv = self.repo.revision_tree(revision_id).inventory
# Cache the inventories while we're at it
self.cache_mgr.inventories[revision_id] = inv
for path, ie in inv.iter_entries():
file_ids[path] = ie.file_id
self.cache_mgr.file_ids = file_ids
+ self.note("Generated the file-ids cache - %d entries" % len(file_ids))
def report_progress(self, details=''):
if self._revision_count % self.progress_every == 0: