clear caches before packing; show cache stats in verbose mode

author: Ian Clatworthy <ian.clatworthy@canonical.com> 2009-03-09 23:37:17 +1000
committer: Ian Clatworthy <ian.clatworthy@canonical.com> 2009-03-09 23:37:17 +1000
commit: 5804f1f560c28bcaca8fd168e0b9406b297f1b91 (patch)
tree: bae407263ba83211250ebdb82e33b423ef5baa03
parent: 672b8e9625a0389dfcee0f7372c32e1e8d8643c6 (diff)
download: bzr-fastimport-5804f1f560c28bcaca8fd168e0b9406b297f1b91.tar.gz
2 files changed, 44 insertions, 4 deletions
diff --git a/cache_manager.py b/cache_manager.py
index 7a00598..527b288 100644
--- a/cache_manager.py
+++ b/cache_manager.py
@@ -17,8 +17,8 @@
 """A manager of caches."""
 
 
-from bzrlib import lru_cache
-
+from bzrlib import lru_cache, trace
+from bzrlib.plugins.fastimport import helpers
 
 class CacheManager(object):
 
@@ -61,6 +61,40 @@ class CacheManager(object):
                 # info not in file - possible when no blobs used
                 pass
 
+    def dump_stats(self, note=trace.note):
+        """Dump some statistics about what we cached."""
+        # TODO: add in inventory statistics
+        note("Cache statistics:")
+        self._show_stats_for(self._sticky_blobs, "sticky blobs", note=note)
+        self._show_stats_for(self.revision_ids, "revision-ids", note=note)
+        self._show_stats_for(self.file_ids, "file-ids", note=note)
+        # These aren't interesting so omit from the output, at least for now
+        #self._show_stats_for(self._blobs, "other blobs", note=note)
+        #self._show_stats_for(self.last_ids, "last-ids", note=note)
+        #self._show_stats_for(self.heads, "heads", note=note)
+
+    def _show_stats_for(self, dict, label, note=trace.note):
+        """Dump statistics about a given dictionary.
+
+        Both the keys and values need to support len().
+        """
+        count = len(dict)
+        size = sum(map(len, dict.keys()))
+        size += sum(map(len, dict.values()))
+        kbytes = size * 1.0 / 1024
+        note("    %-12s: %8.1fs kB (%d %s)" % (label, kbytes, count,
+            helpers.single_plural(count, "item", "items")))
+
+    def clear_all(self):
+        """Free up any memory used by the caches."""
+        self._blobs.clear()
+        self._sticky_blobs.clear()
+        self.revision_ids.clear()
+        self.file_ids.clear()
+        self.last_ids.clear()
+        self.heads.clear()
+        self.inventories.clear()
+
     def store_blob(self, id, data):
         """Store a blob of data."""
         if (self._blobs_to_keep is None or data == '' or
diff --git a/processors/generic_processor.py b/processors/generic_processor.py
index fd39be1..ecca7dd 100644
--- a/processors/generic_processor.py
+++ b/processors/generic_processor.py
@@ -314,6 +314,10 @@ class GenericProcessor(processor.ImportProcessor):
             # checkpoint instead. We now pack the repository to optimise
             # how data is stored.
             if self._revision_count > self.checkpoint_every:
+                # Free whatever memory we can before packing
+                import gc
+                self.cache_mgr.clear_all()
+                gc.collect()
                 self.note("Packing repository ...")
                 self.repo.pack()
                 # To be conservative, packing puts the old packs and
@@ -357,6 +361,8 @@ class GenericProcessor(processor.ImportProcessor):
             bc, helpers.single_plural(bc, "branch", "branches"),
             wtc, helpers.single_plural(wtc, "tree", "trees"),
             time_required)
+        if self.verbose:
+            self.cache_mgr.dump_stats()
 
     def _init_id_map(self):
         """Load the id-map and check it matches the repository.
@@ -410,8 +416,6 @@ class GenericProcessor(processor.ImportProcessor):
             # load the file-ids cache
             if self._revision_count == self.skip_total:
                 self._gen_file_ids_cache()
-                self.note("Generated the file-ids cache - %d entries",
-                    len(self.cache_mgr.file_ids.keys()))
             return
         if self.first_incremental_commit:
             self.first_incremental_commit = None
@@ -449,12 +453,14 @@ class GenericProcessor(processor.ImportProcessor):
         # Update the fileid cache
         file_ids = {}
         for revision_id in revision_ids:
+            self.note("Collecting file-ids for head %s ..." % revision_id)
             inv = self.repo.revision_tree(revision_id).inventory
             # Cache the inventories while we're at it
             self.cache_mgr.inventories[revision_id] = inv
             for path, ie in inv.iter_entries():
                 file_ids[path] = ie.file_id
         self.cache_mgr.file_ids = file_ids
+        self.note("Generated the file-ids cache - %d entries" % len(file_ids))
 
     def report_progress(self, details=''):
         if self._revision_count % self.progress_every == 0:
author	Ian Clatworthy <ian.clatworthy@canonical.com>	2009-03-09 23:37:17 +1000
committer	Ian Clatworthy <ian.clatworthy@canonical.com>	2009-03-09 23:37:17 +1000
commit	5804f1f560c28bcaca8fd168e0b9406b297f1b91 (patch)
tree	bae407263ba83211250ebdb82e33b423ef5baa03
parent	672b8e9625a0389dfcee0f7372c32e1e8d8643c6 (diff)
download	bzr-fastimport-5804f1f560c28bcaca8fd168e0b9406b297f1b91.tar.gz