author      Ian Clatworthy <ian.clatworthy@internode.on.net>    2009-04-02 11:11:15 +1000
committer   Ian Clatworthy <ian.clatworthy@internode.on.net>    2009-04-02 11:11:15 +1000
commit      c9367c8e33d8bb31404af71280e3e9bae429fa04
tree        9028103be5e5f8840a34c9de72d11c72ba60ca59
parent      795b2ddabf9fdf2aa21df73ec1d8cd5c8874e365
download    bzr-fastimport-c9367c8e33d8bb31404af71280e3e9bae429fa04.tar.gz
blob reference counting, not just sticky vs otherwise
-rw-r--r--   cache_manager.py                 | 30
-rw-r--r--   processors/generic_processor.py  | 17
-rw-r--r--   processors/info_processor.py     | 37

3 files changed, 59 insertions(+), 25 deletions(-)
diff --git a/cache_manager.py b/cache_manager.py
index cf28bee..1dff66b 100644
--- a/cache_manager.py
+++ b/cache_manager.py
@@ -54,10 +54,15 @@ class CacheManager(object):
         self.heads = {}
 
         # Work out the blobs to make sticky - None means all
-        self._blobs_to_keep = None
+        self._blob_ref_counts = {}
         if info is not None:
             try:
-                self._blobs_to_keep = info['Blob usage tracking']['multi']
+                blobs_by_counts = info['Blob reference counts']
+                # The parser hands values back as lists, already parsed
+                for count, blob_list in blobs_by_counts.items():
+                    n = int(count)
+                    for b in blob_list:
+                        self._blob_ref_counts[b] = n
             except KeyError:
                 # info not in file - possible when no blobs used
                 pass
@@ -86,8 +91,15 @@ class CacheManager(object):
             else:
                 size = sum(map(len, dict.keys()))
                 size += sum(map(len, dict.values()))
-            kbytes = size * 1.0 / 1024
-            note("    %-12s: %8.1f kB (%d %s)" % (label, kbytes, count,
+            size = size * 1.0 / 1024
+            unit = 'K'
+            if size > 1024:
+                size = size / 1024
+                unit = 'M'
+            if size > 1024:
+                size = size / 1024
+                unit = 'G'
+            note("    %-12s: %8.1f %s (%d %s)" % (label, size, unit, count,
                 helpers.single_plural(count, "item", "items")))
 
     def clear_all(self):
@@ -102,8 +114,7 @@ class CacheManager(object):
 
     def store_blob(self, id, data):
         """Store a blob of data."""
-        if (self._blobs_to_keep is None or data == '' or
-            id in self._blobs_to_keep):
+        if data == '' or id in self._blob_ref_counts:
             self._sticky_blobs[id] = data
         else:
             self._blobs[id] = data
@@ -111,7 +122,12 @@ class CacheManager(object):
 
     def fetch_blob(self, id):
         """Fetch a blob of data."""
         try:
-            return self._sticky_blobs[id]
+            b = self._sticky_blobs[id]
+            if b != '':
+                self._blob_ref_counts[id] -= 1
+                if self._blob_ref_counts[id] == 0:
+                    del self._sticky_blobs[id]
+            return b
         except KeyError:
             return self._blobs.pop(id)
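The heart of the cache_manager.py change reads more easily outside diff context. Below is a minimal, self-contained sketch of the same scheme; the class and method names are illustrative, not the plugin's API. Blobs with a known reference count stay cached until their final expected fetch and are then evicted, while blobs expected to be fetched exactly once are freed by pop() on first use.

    class RefCountedBlobCache(object):
        """Sketch: cache blobs, evicting each one after its last expected fetch."""

        def __init__(self, ref_counts):
            # ref_counts: {blob_id: predicted number of fetches}, as parsed
            # from a 'Blob reference counts' info section
            self.ref_counts = dict(ref_counts)
            self.sticky_blobs = {}   # multi-use blobs, kept until count hits 0
            self.blobs = {}          # single-use blobs, freed on first fetch

        def store(self, id, data):
            if data == '' or id in self.ref_counts:
                self.sticky_blobs[id] = data
            else:
                self.blobs[id] = data

        def fetch(self, id):
            try:
                data = self.sticky_blobs[id]
            except KeyError:
                # Single-use blob: pop() releases our reference immediately
                return self.blobs.pop(id)
            if data != '':
                self.ref_counts[id] -= 1
                if self.ref_counts[id] == 0:
                    # Last expected use - stop holding the data in memory
                    del self.sticky_blobs[id]
            return data

For example, a cache built with RefCountedBlobCache({':1': 2}) keeps blob ':1' through the first fetch and drops it on the second, whereas the old code kept every 'multi' blob alive for the rest of the import.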
note("To refresh the working tree for a branch, " @@ -325,9 +329,8 @@ class GenericProcessor(processor.ImportProcessor): # that groupcompress is configured to optimise disk space import gc if final: - self.cache_mgr.clear_all() try: - from bzrlib.plugins.groupcompress import groupcompress + from bzrlib import groupcompress except ImportError: pass else: @@ -377,8 +380,6 @@ class GenericProcessor(processor.ImportProcessor): bc, helpers.single_plural(bc, "branch", "branches"), wtc, helpers.single_plural(wtc, "tree", "trees"), time_required) - if self.verbose: - self.cache_mgr.dump_stats() def _init_id_map(self): """Load the id-map and check it matches the repository. diff --git a/processors/info_processor.py b/processors/info_processor.py index 0e05c37..8296869 100644 --- a/processors/info_processor.py +++ b/processors/info_processor.py @@ -40,7 +40,7 @@ class InfoProcessor(processor.ImportProcessor): the source. """ - def __init__(self, target=None, params=None, verbose=False): + def __init__(self, target=None, params=None, verbose=0): # Allow creation without a target processor.ImportProcessor.__init__(self, target, params, verbose) @@ -62,8 +62,9 @@ class InfoProcessor(processor.ImportProcessor): self.lightweight_tags = 0 # Blob usage tracking self.blobs = {} - for usage in ['new', 'used', 'multi', 'unknown', 'unmarked']: + for usage in ['new', 'used', 'unknown', 'unmarked']: self.blobs[usage] = set() + self.blob_ref_counts = {} # Head tracking - delegate to the cache manager self.cache_mgr = cache_manager.CacheManager(inventory_cache_size=0) # Stuff to cache: a map from mark to # of times that mark is merged @@ -108,10 +109,18 @@ class InfoProcessor(processor.ImportProcessor): # note("\t%d\t%s" % (len(self.committers), 'unique committers')) self._dump_stats_group("Merges", self.merges.keys(), self.merges.values(), None) - self._dump_stats_group("Rename old paths", self.rename_old_paths.keys(), - self.rename_old_paths.values(), len, _iterable_as_config_list) - self._dump_stats_group("Copy source paths", self.copy_source_paths.keys(), - self.copy_source_paths.values(), len, _iterable_as_config_list) + # We only show the rename old path and copy source paths when -vv + # (verbose=2) is specified. The output here for mysql's data can't + # be parsed currently so this bit of code needs more work anyhow .. 
diff --git a/processors/info_processor.py b/processors/info_processor.py
index 0e05c37..8296869 100644
--- a/processors/info_processor.py
+++ b/processors/info_processor.py
@@ -40,7 +40,7 @@ class InfoProcessor(processor.ImportProcessor):
     the source.
     """
 
-    def __init__(self, target=None, params=None, verbose=False):
+    def __init__(self, target=None, params=None, verbose=0):
         # Allow creation without a target
         processor.ImportProcessor.__init__(self, target, params, verbose)
 
@@ -62,8 +62,9 @@ class InfoProcessor(processor.ImportProcessor):
         self.lightweight_tags = 0
         # Blob usage tracking
         self.blobs = {}
-        for usage in ['new', 'used', 'multi', 'unknown', 'unmarked']:
+        for usage in ['new', 'used', 'unknown', 'unmarked']:
             self.blobs[usage] = set()
+        self.blob_ref_counts = {}
         # Head tracking - delegate to the cache manager
         self.cache_mgr = cache_manager.CacheManager(inventory_cache_size=0)
         # Stuff to cache: a map from mark to # of times that mark is merged
@@ -108,10 +109,18 @@ class InfoProcessor(processor.ImportProcessor):
             # note("\t%d\t%s" % (len(self.committers), 'unique committers'))
             self._dump_stats_group("Merges", self.merges.keys(),
                 self.merges.values(), None)
-            self._dump_stats_group("Rename old paths", self.rename_old_paths.keys(),
-                self.rename_old_paths.values(), len, _iterable_as_config_list)
-            self._dump_stats_group("Copy source paths", self.copy_source_paths.keys(),
-                self.copy_source_paths.values(), len, _iterable_as_config_list)
+            # We only show the rename old path and copy source paths when -vv
+            # (verbose=2) is specified. The output here for mysql's data can't
+            # be parsed currently so this bit of code needs more work anyhow ..
+            if self.verbose >= 2:
+                self._dump_stats_group("Rename old paths",
+                    self.rename_old_paths.keys(),
+                    self.rename_old_paths.values(), len,
+                    _iterable_as_config_list)
+                self._dump_stats_group("Copy source paths",
+                    self.copy_source_paths.keys(),
+                    self.copy_source_paths.values(), len,
+                    _iterable_as_config_list)
 
         # Blob stats
         if self.cmd_counts['blob']:
@@ -120,6 +129,11 @@ class InfoProcessor(processor.ImportProcessor):
                 del self.blobs['used']
             self._dump_stats_group("Blob usage tracking", self.blobs.keys(),
                 self.blobs.values(), len, _iterable_as_config_list)
+            if self.blob_ref_counts:
+                blobs_by_count = helpers.invert_dict(self.blob_ref_counts)
+                self._dump_stats_group("Blob reference counts",
+                    blobs_by_count.keys(),
+                    blobs_by_count.values(), len, _iterable_as_config_list)
 
         # Other stats
         if self.cmd_counts['reset']:
@@ -145,14 +159,16 @@ class InfoProcessor(processor.ImportProcessor):
             for name, value in zip(names, values):
                 if verbose_formatter is not None:
                     value = verbose_formatter(value)
-                print "%s = %s" % (name.replace(' ', '-'),value)
+                if type(name) == str:
+                    name = name.replace(' ', '-')
+                print "%s = %s" % (name, value)
             print ""
         else:
             print "%s:" % (title,)
             for name, value in zip(names, values):
                 if normal_formatter is not None:
                     value = normal_formatter(value)
-                print "\t%s\t%s" % (value,name)
+                print "\t%s\t%s" % (value, name)
 
     def progress_handler(self, cmd):
         """Process a ProgressCommand."""
@@ -235,10 +251,11 @@ class InfoProcessor(processor.ImportProcessor):
         self.cmd_counts[cmd.name] += 1
 
     def _track_blob(self, mark):
-        if mark in self.blobs['multi']:
+        if mark in self.blob_ref_counts:
+            self.blob_ref_counts[mark] += 1
             pass
         elif mark in self.blobs['used']:
-            self.blobs['multi'].add(mark)
+            self.blob_ref_counts[mark] = 2
             self.blobs['used'].remove(mark)
         elif mark in self.blobs['new']:
             self.blobs['used'].add(mark)
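The new "Blob reference counts" stats section and the CacheManager parsing in the first file are two halves of one round trip: the info processor inverts its {mark: count} map so blobs are grouped by count for output, and the importer expands the groups back into per-blob counts. A small sketch, with a stand-in for the helpers.invert_dict used above (assumed here to group keys by their value):

    def invert_dict(d):
        """Stand-in for helpers.invert_dict: group keys by their value."""
        result = {}
        for k, v in d.items():
            result.setdefault(v, []).append(k)
        return result

    # What the info processor writes out (blobs grouped by count):
    ref_counts = {':1': 2, ':2': 2, ':3': 5}
    by_count = invert_dict(ref_counts)   # {2: [':1', ':2'], 5: [':3']}

    # What CacheManager.__init__ does with the parsed section:
    restored = {}
    for count, blob_list in by_count.items():
        n = int(count)   # in the real flow the parser hands counts back as strings
        for b in blob_list:
            restored[b] = n
    assert restored == ref_counts

This also explains why _track_blob seeds blob_ref_counts[mark] = 2 on a blob's second use: blobs in 'used' are fetched exactly once and need no entry, so any blob that earns a count starts at two references.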