author     Ian Clatworthy <ian.clatworthy@internode.on.net>  2009-04-02 11:11:15 +1000
committer  Ian Clatworthy <ian.clatworthy@internode.on.net>  2009-04-02 11:11:15 +1000
commit     c9367c8e33d8bb31404af71280e3e9bae429fa04 (patch)
tree       9028103be5e5f8840a34c9de72d11c72ba60ca59
parent     795b2ddabf9fdf2aa21df73ec1d8cd5c8874e365 (diff)
download   bzr-fastimport-c9367c8e33d8bb31404af71280e3e9bae429fa04.tar.gz
blob reference counting, not just sticky vs otherwise
-rw-r--r--  cache_manager.py                 | 30
-rw-r--r--  processors/generic_processor.py  | 17
-rw-r--r--  processors/info_processor.py     | 37
3 files changed, 59 insertions, 25 deletions
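
The heart of the change is the cache manager: rather than keeping every multiply-used blob sticky for the whole import, it now records how many fetches each blob has left and drops the blob once the last expected use is consumed. A minimal standalone sketch of that scheme follows; the class and method names are illustrative only, not the plugin's API.

class RefCountedBlobCache(object):
    """Keep a blob only while fetches are still expected for it."""

    def __init__(self, ref_counts):
        # ref_counts: blob id -> number of expected fetches, gathered
        # up front (for example by a prior pass over the import stream)
        self._ref_counts = dict(ref_counts)
        self._blobs = {}

    def store(self, blob_id, data):
        self._blobs[blob_id] = data

    def fetch(self, blob_id):
        data = self._blobs[blob_id]
        remaining = self._ref_counts.get(blob_id, 1) - 1
        if remaining <= 0:
            # Last expected use: free the memory straight away
            del self._blobs[blob_id]
            self._ref_counts.pop(blob_id, None)
        else:
            self._ref_counts[blob_id] = remaining
        return data

For example, a blob stored with a ref count of 2 survives the first fetch and is discarded on the second, which keeps memory usage closer to the working set than the old keep-forever "sticky" behaviour.
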
diff --git a/cache_manager.py b/cache_manager.py
index cf28bee..1dff66b 100644
--- a/cache_manager.py
+++ b/cache_manager.py
@@ -54,10 +54,15 @@ class CacheManager(object):
self.heads = {}
# Work out the blobs to make sticky - None means all
- self._blobs_to_keep = None
+ self._blob_ref_counts = {}
if info is not None:
try:
- self._blobs_to_keep = info['Blob usage tracking']['multi']
+ blobs_by_counts = info['Blob reference counts']
+ # The parser hands values back as lists, already parsed
+ for count, blob_list in blobs_by_counts.items():
+ n = int(count)
+ for b in blob_list:
+ self._blob_ref_counts[b] = n
except KeyError:
# info not in file - possible when no blobs used
pass
@@ -86,8 +91,15 @@ class CacheManager(object):
else:
size = sum(map(len, dict.keys()))
size += sum(map(len, dict.values()))
- kbytes = size * 1.0 / 1024
- note(" %-12s: %8.1f kB (%d %s)" % (label, kbytes, count,
+ size = size * 1.0 / 1024
+ unit = 'K'
+ if size > 1024:
+ size = size / 1024
+ unit = 'M'
+ if size > 1024:
+ size = size / 1024
+ unit = 'G'
+ note(" %-12s: %8.1f %s (%d %s)" % (label, size, unit, count,
helpers.single_plural(count, "item", "items")))
def clear_all(self):
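
The dump_stats hunk above also changes the size reporting from a fixed kB figure to one scaled into K, M or G units. The same scaling, pulled out into a free-standing helper for illustration (the helper name is made up here, not part of the plugin):

def human_size(num_bytes):
    # Scale bytes into K, then M, then G, as in the hunk above
    size = num_bytes / 1024.0
    unit = 'K'
    if size > 1024:
        size = size / 1024
        unit = 'M'
    if size > 1024:
        size = size / 1024
        unit = 'G'
    return "%8.1f %s" % (size, unit)

# human_size(3 * 1024 * 1024) -> '     3.0 M'
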
@@ -102,8 +114,7 @@ class CacheManager(object):
def store_blob(self, id, data):
"""Store a blob of data."""
- if (self._blobs_to_keep is None or data == '' or
- id in self._blobs_to_keep):
+ if data == '' or id in self._blob_ref_counts:
self._sticky_blobs[id] = data
else:
self._blobs[id] = data
@@ -111,7 +122,12 @@ class CacheManager(object):
def fetch_blob(self, id):
"""Fetch a blob of data."""
try:
- return self._sticky_blobs[id]
+ b = self._sticky_blobs[id]
+ if b != '':
+ self._blob_ref_counts[id] -= 1
+ if self._blob_ref_counts[id] == 0:
+ del self._sticky_blobs[id]
+ return b
except KeyError:
return self._blobs.pop(id)
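
Those reference counts reach the CacheManager through the optional info data produced by the info processor: its "Blob reference counts" section uses counts as keys and lists of blob marks as values, and the first hunk in this file expands that back into a per-blob mapping. A sketch of that expansion, assuming the section has already been parsed into a dict of lists (the marks below are example data):

# Section as emitted by the info processor: count -> blobs used that often
blobs_by_counts = {
    '2': [':1', ':4'],
    '3': [':7'],
}

# Expand back into the per-blob mapping the CacheManager keeps
blob_ref_counts = {}
for count, blob_list in blobs_by_counts.items():
    n = int(count)
    for mark in blob_list:
        blob_ref_counts[mark] = n

# blob_ref_counts == {':1': 2, ':4': 2, ':7': 3}
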
diff --git a/processors/generic_processor.py b/processors/generic_processor.py
index f4fc365..b7f5d29 100644
--- a/processors/generic_processor.py
+++ b/processors/generic_processor.py
@@ -153,7 +153,7 @@ class GenericProcessor(processor.ImportProcessor):
# We want to repack at the end anyhow when more information
# is available to do a better job of saving space.
try:
- from bzrlib.plugins.groupcompress import groupcompress
+ from bzrlib import groupcompress
groupcompress._FAST = True
except ImportError:
pass
@@ -285,7 +285,7 @@ class GenericProcessor(processor.ImportProcessor):
branch_name = lost_info[0]
self.note("\t %s = %s", head_revision, branch_name)
- # Update the working trees as requested and dump stats
+ # Update the working trees as requested
self._tree_count = 0
remind_about_update = True
if self._branch_count == 0:
@@ -306,15 +306,19 @@ class GenericProcessor(processor.ImportProcessor):
remind_about_update = False
else:
self.warning("No working trees available to update")
- self.dump_stats()
+ # Dump the cache stats now because we clear it before the final pack
+ if self.verbose:
+ self.cache_mgr.dump_stats()
if self._original_max_pack_count:
# We earlier disabled autopacking, creating one pack every
# checkpoint instead. We now pack the repository to optimise
# how data is stored.
+ self.cache_mgr.clear_all()
self._pack_repository()
- # Finish up by telling the user what to do next.
+ # Finish up by dumping stats & telling the user what to do next.
+ self.dump_stats()
if remind_about_update:
# This message is explicitly not timestamped.
note("To refresh the working tree for a branch, "
@@ -325,9 +329,8 @@ class GenericProcessor(processor.ImportProcessor):
# that groupcompress is configured to optimise disk space
import gc
if final:
- self.cache_mgr.clear_all()
try:
- from bzrlib.plugins.groupcompress import groupcompress
+ from bzrlib import groupcompress
except ImportError:
pass
else:
@@ -377,8 +380,6 @@ class GenericProcessor(processor.ImportProcessor):
bc, helpers.single_plural(bc, "branch", "branches"),
wtc, helpers.single_plural(wtc, "tree", "trees"),
time_required)
- if self.verbose:
- self.cache_mgr.dump_stats()
def _init_id_map(self):
"""Load the id-map and check it matches the repository.
diff --git a/processors/info_processor.py b/processors/info_processor.py
index 0e05c37..8296869 100644
--- a/processors/info_processor.py
+++ b/processors/info_processor.py
@@ -40,7 +40,7 @@ class InfoProcessor(processor.ImportProcessor):
the source.
"""
- def __init__(self, target=None, params=None, verbose=False):
+ def __init__(self, target=None, params=None, verbose=0):
# Allow creation without a target
processor.ImportProcessor.__init__(self, target, params, verbose)
@@ -62,8 +62,9 @@ class InfoProcessor(processor.ImportProcessor):
self.lightweight_tags = 0
# Blob usage tracking
self.blobs = {}
- for usage in ['new', 'used', 'multi', 'unknown', 'unmarked']:
+ for usage in ['new', 'used', 'unknown', 'unmarked']:
self.blobs[usage] = set()
+ self.blob_ref_counts = {}
# Head tracking - delegate to the cache manager
self.cache_mgr = cache_manager.CacheManager(inventory_cache_size=0)
# Stuff to cache: a map from mark to # of times that mark is merged
@@ -108,10 +109,18 @@ class InfoProcessor(processor.ImportProcessor):
# note("\t%d\t%s" % (len(self.committers), 'unique committers'))
self._dump_stats_group("Merges", self.merges.keys(),
self.merges.values(), None)
- self._dump_stats_group("Rename old paths", self.rename_old_paths.keys(),
- self.rename_old_paths.values(), len, _iterable_as_config_list)
- self._dump_stats_group("Copy source paths", self.copy_source_paths.keys(),
- self.copy_source_paths.values(), len, _iterable_as_config_list)
+ # We only show the rename old paths and copy source paths when -vv
+ # (verbose=2) is specified. The output here for mysql's data can't
+ # be parsed currently so this bit of code needs more work anyhow ...
+ if self.verbose >= 2:
+ self._dump_stats_group("Rename old paths",
+ self.rename_old_paths.keys(),
+ self.rename_old_paths.values(), len,
+ _iterable_as_config_list)
+ self._dump_stats_group("Copy source paths",
+ self.copy_source_paths.keys(),
+ self.copy_source_paths.values(), len,
+ _iterable_as_config_list)
# Blob stats
if self.cmd_counts['blob']:
@@ -120,6 +129,11 @@ class InfoProcessor(processor.ImportProcessor):
del self.blobs['used']
self._dump_stats_group("Blob usage tracking", self.blobs.keys(),
self.blobs.values(), len, _iterable_as_config_list)
+ if self.blob_ref_counts:
+ blobs_by_count = helpers.invert_dict(self.blob_ref_counts)
+ self._dump_stats_group("Blob reference counts",
+ blobs_by_count.keys(),
+ blobs_by_count.values(), len, _iterable_as_config_list)
# Other stats
if self.cmd_counts['reset']:
@@ -145,14 +159,16 @@ class InfoProcessor(processor.ImportProcessor):
for name, value in zip(names, values):
if verbose_formatter is not None:
value = verbose_formatter(value)
- print "%s = %s" % (name.replace(' ', '-'),value)
+ if type(name) == str:
+ name = name.replace(' ', '-')
+ print "%s = %s" % (name, value)
print ""
else:
print "%s:" % (title,)
for name, value in zip(names, values):
if normal_formatter is not None:
value = normal_formatter(value)
- print "\t%s\t%s" % (value,name)
+ print "\t%s\t%s" % (value, name)
def progress_handler(self, cmd):
"""Process a ProgressCommand."""
@@ -235,10 +251,11 @@ class InfoProcessor(processor.ImportProcessor):
self.cmd_counts[cmd.name] += 1
def _track_blob(self, mark):
- if mark in self.blobs['multi']:
+ if mark in self.blob_ref_counts:
+ self.blob_ref_counts[mark] += 1
pass
elif mark in self.blobs['used']:
- self.blobs['multi'].add(mark)
+ self.blob_ref_counts[mark] = 2
self.blobs['used'].remove(mark)
elif mark in self.blobs['new']:
self.blobs['used'].add(mark)
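
On the producing side, _track_blob above builds the counts incrementally: a blob's first use moves it from 'new' to 'used', its second use promotes it into blob_ref_counts with a count of 2, and every later use just increments. A condensed sketch of that bookkeeping, using free functions instead of processor state (simplified for illustration):

def track_blob(mark, new, used, ref_counts):
    # mark: the blob id referenced by a filemodify command
    if mark in ref_counts:
        ref_counts[mark] += 1      # third or later use
    elif mark in used:
        ref_counts[mark] = 2       # second use: start counting
        used.remove(mark)
    elif mark in new:
        used.add(mark)             # first use
        new.remove(mark)

new, used, ref_counts = set([':1']), set(), {}
for _ in range(3):
    track_blob(':1', new, used, ref_counts)
# ref_counts == {':1': 3}

helpers.invert_dict then groups those per-blob counts by value to produce the "Blob reference counts" section that the cache manager consumes.
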