author      Ian Clatworthy <ian.clatworthy@internode.on.net>    2009-04-02 11:11:15 +1000
committer   Ian Clatworthy <ian.clatworthy@internode.on.net>    2009-04-02 11:11:15 +1000
commit      c9367c8e33d8bb31404af71280e3e9bae429fa04
tree        9028103be5e5f8840a34c9de72d11c72ba60ca59
parent      795b2ddabf9fdf2aa21df73ec1d8cd5c8874e365
download    bzr-fastimport-c9367c8e33d8bb31404af71280e3e9bae429fa04.tar.gz
blob reference counting, not just sticky vs otherwise
-rw-r--r--   cache_manager.py                 | 30
-rw-r--r--   processors/generic_processor.py  | 17
-rw-r--r--   processors/info_processor.py     | 37

3 files changed, 59 insertions(+), 25 deletions(-)
diff --git a/cache_manager.py b/cache_manager.py
index cf28bee..1dff66b 100644
--- a/cache_manager.py
+++ b/cache_manager.py
@@ -54,10 +54,15 @@ class CacheManager(object):
         self.heads = {}
 
         # Work out the blobs to make sticky - None means all
-        self._blobs_to_keep = None
+        self._blob_ref_counts = {}
         if info is not None:
             try:
-                self._blobs_to_keep = info['Blob usage tracking']['multi']
+                blobs_by_counts = info['Blob reference counts']
+                # The parser hands values back as lists, already parsed
+                for count, blob_list in blobs_by_counts.items():
+                    n = int(count)
+                    for b in blob_list:
+                        self._blob_ref_counts[b] = n
             except KeyError:
                 # info not in file - possible when no blobs used
                 pass
@@ -86,8 +91,15 @@ class CacheManager(object):
             else:
                 size = sum(map(len, dict.keys()))
                 size += sum(map(len, dict.values()))
-            kbytes = size * 1.0 / 1024
-            note("    %-12s: %8.1f kB (%d %s)" % (label, kbytes, count,
+            size = size * 1.0 / 1024
+            unit = 'K'
+            if size > 1024:
+                size = size / 1024
+                unit = 'M'
+            if size > 1024:
+                size = size / 1024
+                unit = 'G'
+            note("    %-12s: %8.1f %s (%d %s)" % (label, size, unit, count,
                 helpers.single_plural(count, "item", "items")))
 
     def clear_all(self):
@@ -102,8 +114,7 @@ class CacheManager(object):
 
     def store_blob(self, id, data):
         """Store a blob of data."""
-        if (self._blobs_to_keep is None or data == '' or
-            id in self._blobs_to_keep):
+        if data == '' or id in self._blob_ref_counts:
             self._sticky_blobs[id] = data
         else:
             self._blobs[id] = data
@@ -111,7 +122,12 @@ class CacheManager(object):
 
     def fetch_blob(self, id):
         """Fetch a blob of data."""
         try:
-            return self._sticky_blobs[id]
+            b = self._sticky_blobs[id]
+            if b != '':
+                self._blob_ref_counts[id] -= 1
+                if self._blob_ref_counts[id] == 0:
+                    del self._sticky_blobs[id]
+            return b
         except KeyError:
             return self._blobs.pop(id)
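The heart of the cache_manager.py change reads more easily outside diff context. Below is a minimal, self-contained sketch of the same scheme; the class and method names are illustrative, not the plugin's API. Blobs with a known reference count stay cached until their final expected fetch and are then evicted, while blobs expected to be fetched exactly once are freed by pop() on first use.

    class RefCountedBlobCache(object):
        """Sketch: cache blobs, evicting each one after its last expected fetch."""

        def __init__(self, ref_counts):
            # ref_counts: {blob_id: predicted number of fetches}, as parsed
            # from a 'Blob reference counts' info section
            self.ref_counts = dict(ref_counts)
            self.sticky_blobs = {}   # multi-use blobs, kept until count hits 0
            self.blobs = {}          # single-use blobs, freed on first fetch

        def store(self, id, data):
            if data == '' or id in self.ref_counts:
                self.sticky_blobs[id] = data
            else:
                self.blobs[id] = data

        def fetch(self, id):
            try:
                data = self.sticky_blobs[id]
            except KeyError:
                # Single-use blob: pop() releases our reference immediately
                return self.blobs.pop(id)
            if data != '':
                self.ref_counts[id] -= 1
                if self.ref_counts[id] == 0:
                    # Last expected use - stop holding the data in memory
                    del self.sticky_blobs[id]
            return data

For example, a cache built with RefCountedBlobCache({':1': 2}) keeps blob ':1' through the first fetch and drops it on the second, whereas the old code kept every 'multi' blob alive for the rest of the import.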
note("To refresh the working tree for a branch, " @@ -325,9 +329,8 @@ class GenericProcessor(processor.ImportProcessor): # that groupcompress is configured to optimise disk space import gc if final: - self.cache_mgr.clear_all() try: - from bzrlib.plugins.groupcompress import groupcompress + from bzrlib import groupcompress except ImportError: pass else: @@ -377,8 +380,6 @@ class GenericProcessor(processor.ImportProcessor): bc, helpers.single_plural(bc, "branch", "branches"), wtc, helpers.single_plural(wtc, "tree", "trees"), time_required) - if self.verbose: - self.cache_mgr.dump_stats() def _init_id_map(self): """Load the id-map and check it matches the repository. diff --git a/processors/info_processor.py b/processors/info_processor.py index 0e05c37..8296869 100644 --- a/processors/info_processor.py +++ b/processors/info_processor.py @@ -40,7 +40,7 @@ class InfoProcessor(processor.ImportProcessor): the source. """ - def __init__(self, target=None, params=None, verbose=False): + def __init__(self, target=None, params=None, verbose=0): # Allow creation without a target processor.ImportProcessor.__init__(self, target, params, verbose) @@ -62,8 +62,9 @@ class InfoProcessor(processor.ImportProcessor): self.lightweight_tags = 0 # Blob usage tracking self.blobs = {} - for usage in ['new', 'used', 'multi', 'unknown', 'unmarked']: + for usage in ['new', 'used', 'unknown', 'unmarked']: self.blobs[usage] = set() + self.blob_ref_counts = {} # Head tracking - delegate to the cache manager self.cache_mgr = cache_manager.CacheManager(inventory_cache_size=0) # Stuff to cache: a map from mark to # of times that mark is merged @@ -108,10 +109,18 @@ class InfoProcessor(processor.ImportProcessor): # note("\t%d\t%s" % (len(self.committers), 'unique committers')) self._dump_stats_group("Merges", self.merges.keys(), self.merges.values(), None) - self._dump_stats_group("Rename old paths", self.rename_old_paths.keys(), - self.rename_old_paths.values(), len, _iterable_as_config_list) - self._dump_stats_group("Copy source paths", self.copy_source_paths.keys(), - self.copy_source_paths.values(), len, _iterable_as_config_list) + # We only show the rename old path and copy source paths when -vv + # (verbose=2) is specified. The output here for mysql's data can't + # be parsed currently so this bit of code needs more work anyhow .. 
diff --git a/processors/info_processor.py b/processors/info_processor.py
index 0e05c37..8296869 100644
--- a/processors/info_processor.py
+++ b/processors/info_processor.py
@@ -40,7 +40,7 @@ class InfoProcessor(processor.ImportProcessor):
     the source.
     """
 
-    def __init__(self, target=None, params=None, verbose=False):
+    def __init__(self, target=None, params=None, verbose=0):
         # Allow creation without a target
         processor.ImportProcessor.__init__(self, target, params, verbose)
 
@@ -62,8 +62,9 @@ class InfoProcessor(processor.ImportProcessor):
         self.lightweight_tags = 0
         # Blob usage tracking
         self.blobs = {}
-        for usage in ['new', 'used', 'multi', 'unknown', 'unmarked']:
+        for usage in ['new', 'used', 'unknown', 'unmarked']:
             self.blobs[usage] = set()
+        self.blob_ref_counts = {}
         # Head tracking - delegate to the cache manager
         self.cache_mgr = cache_manager.CacheManager(inventory_cache_size=0)
         # Stuff to cache: a map from mark to # of times that mark is merged
@@ -108,10 +109,18 @@ class InfoProcessor(processor.ImportProcessor):
             # note("\t%d\t%s" % (len(self.committers), 'unique committers'))
             self._dump_stats_group("Merges", self.merges.keys(),
                 self.merges.values(), None)
-            self._dump_stats_group("Rename old paths", self.rename_old_paths.keys(),
-                self.rename_old_paths.values(), len, _iterable_as_config_list)
-            self._dump_stats_group("Copy source paths", self.copy_source_paths.keys(),
-                self.copy_source_paths.values(), len, _iterable_as_config_list)
+            # We only show the rename old path and copy source paths when -vv
+            # (verbose=2) is specified. The output here for mysql's data can't
+            # be parsed currently so this bit of code needs more work anyhow ..
+            if self.verbose >= 2:
+                self._dump_stats_group("Rename old paths",
+                    self.rename_old_paths.keys(),
+                    self.rename_old_paths.values(), len,
+                    _iterable_as_config_list)
+                self._dump_stats_group("Copy source paths",
+                    self.copy_source_paths.keys(),
+                    self.copy_source_paths.values(), len,
+                    _iterable_as_config_list)
 
         # Blob stats
         if self.cmd_counts['blob']:
@@ -120,6 +129,11 @@ class InfoProcessor(processor.ImportProcessor):
                 del self.blobs['used']
             self._dump_stats_group("Blob usage tracking", self.blobs.keys(),
                 self.blobs.values(), len, _iterable_as_config_list)
+            if self.blob_ref_counts:
+                blobs_by_count = helpers.invert_dict(self.blob_ref_counts)
+                self._dump_stats_group("Blob reference counts",
+                    blobs_by_count.keys(),
+                    blobs_by_count.values(), len, _iterable_as_config_list)
 
         # Other stats
         if self.cmd_counts['reset']:
@@ -145,14 +159,16 @@ class InfoProcessor(processor.ImportProcessor):
             for name, value in zip(names, values):
                 if verbose_formatter is not None:
                     value = verbose_formatter(value)
-                print "%s = %s" % (name.replace(' ', '-'),value)
+                if type(name) == str:
+                    name = name.replace(' ', '-')
+                print "%s = %s" % (name, value)
             print ""
         else:
             print "%s:" % (title,)
             for name, value in zip(names, values):
                 if normal_formatter is not None:
                     value = normal_formatter(value)
-                print "\t%s\t%s" % (value,name)
+                print "\t%s\t%s" % (value, name)
 
     def progress_handler(self, cmd):
         """Process a ProgressCommand."""
@@ -235,10 +251,11 @@ class InfoProcessor(processor.ImportProcessor):
         self.cmd_counts[cmd.name] += 1
 
     def _track_blob(self, mark):
-        if mark in self.blobs['multi']:
+        if mark in self.blob_ref_counts:
+            self.blob_ref_counts[mark] += 1
             pass
         elif mark in self.blobs['used']:
-            self.blobs['multi'].add(mark)
+            self.blob_ref_counts[mark] = 2
             self.blobs['used'].remove(mark)
         elif mark in self.blobs['new']:
             self.blobs['used'].add(mark)
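The new "Blob reference counts" stats section and the CacheManager parsing in the first file are two halves of one round trip: the info processor inverts its {mark: count} map so blobs are grouped by count for output, and the importer expands the groups back into per-blob counts. A small sketch, with a stand-in for the helpers.invert_dict used above (assumed here to group keys by their value):

    def invert_dict(d):
        """Stand-in for helpers.invert_dict: group keys by their value."""
        result = {}
        for k, v in d.items():
            result.setdefault(v, []).append(k)
        return result

    # What the info processor writes out (blobs grouped by count):
    ref_counts = {':1': 2, ':2': 2, ':3': 5}
    by_count = invert_dict(ref_counts)   # {2: [':1', ':2'], 5: [':3']}

    # What CacheManager.__init__ does with the parsed section:
    restored = {}
    for count, blob_list in by_count.items():
        n = int(count)   # in the real flow the parser hands counts back as strings
        for b in blob_list:
            restored[b] = n
    assert restored == ref_counts

This also explains why _track_blob seeds blob_ref_counts[mark] = 2 on a blob's second use: blobs in 'used' are fetched exactly once and need no entry, so any blob that earns a count starts at two references.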