diff options
author | John Arbash Meinel <john@arbash-meinel.com> | 2009-11-13 13:16:20 -0600 |
---|---|---|
committer | John Arbash Meinel <john@arbash-meinel.com> | 2009-11-13 13:16:20 -0600 |
commit | 9db9f8e95c23e13390e3cdec19b55f9efaadf781 (patch) | |
tree | e78faf5403b46001915fb9a8efe19d0f8a37a70f /cache_manager.py | |
parent | e7503b47b93ca6efe91bdd3d7fff0c7ecbff1786 (diff) | |
download | bzr-fastimport-9db9f8e95c23e13390e3cdec19b55f9efaadf781.tar.gz |
Switch to closing the large-content blobs that we store to disk.
During the qt import, we end up with >2000 large blobs, and a few hundred MB of
small blobs. We don't want to create more small blobs, because their disk space
is not reclaimed, but >2000 is too many open file handles (at least on Windows).
Using filenames works, but we aren't as guaranteed that things will clean up
nicely.
Diffstat (limited to 'cache_manager.py')
-rw-r--r-- | cache_manager.py | 61 |
1 files changed, 40 insertions, 21 deletions
diff --git a/cache_manager.py b/cache_manager.py index 4227554..f35134f 100644 --- a/cache_manager.py +++ b/cache_manager.py @@ -42,7 +42,8 @@ class _Cleanup(object): def finalize(self): if self.disk_blobs is not None: for info in self.disk_blobs.itervalues(): - info[-1].close() + if info[-1] is not None: + os.unlink(info[-1]) self.disk_blobs = None if self.small_blobs is not None: self.small_blobs.close() @@ -53,8 +54,8 @@ class _Cleanup(object): class CacheManager(object): - _small_blob_threshold = 100*1024 - _sticky_cache_size = 200*1024*1024 + _small_blob_threshold = 75*1024 + _sticky_cache_size = 300*1024*1024 _sticky_flushed_size = 100*1024*1024 def __init__(self, info=None, verbose=False, inventory_cache_size=10): @@ -74,9 +75,14 @@ class CacheManager(object): # if we overflow our memory cache, then we will dump large blobs to # disk in this directory self._tempdir = None - # id => TemporaryFile + # id => (offset, n_bytes, fname) + # if fname is None, then the content is stored in the small file self._disk_blobs = {} self._cleanup = _Cleanup(self._disk_blobs) + # atexit.register(self._cleanup.finalize) + # The main problem is that it won't let cleanup go away 'normally', so + # we really need a weakref callback... + # Perhaps just registering the shutil.rmtree? # revision-id -> Inventory cache # these are large and we probably don't need too many as @@ -154,12 +160,13 @@ class CacheManager(object): def _flush_blobs_to_disk(self): blobs = self._sticky_blobs.keys() sticky_blobs = self._sticky_blobs + total_blobs = len(sticky_blobs) blobs.sort(key=lambda k:len(sticky_blobs[k])) if self._tempdir is None: self._tempdir = tempfile.mkdtemp(prefix='bzr_fastimport_blobs-') self._cleanup.tempdir = self._tempdir self._cleanup.small_blobs = tempfile.TemporaryFile( - prefix='small-blobs-') + prefix='small-blobs-', dir=self._tempdir) count = 0 bytes = 0 n_small_bytes = 0 @@ -171,17 +178,19 @@ class CacheManager(object): if n_bytes < self._small_blob_threshold: f = self._cleanup.small_blobs f.seek(0, os.SEEK_END) - self._disk_blobs[id] = (True, f.tell(), n_bytes, f) + self._disk_blobs[id] = (f.tell(), n_bytes, None) + f.write(blob) n_small_bytes += n_bytes else: - f = tempfile.TemporaryFile(prefix='blob-', dir=self._tempdir) - self._disk_blobs[id] = (False, 0, n_bytes, f) - f.write(blob) + fd, name = tempfile.mkstemp(prefix='blob-', dir=self._tempdir) + os.write(fd, blob) + os.close(fd) + self._disk_blobs[id] = (0, n_bytes, name) bytes += n_bytes del blob count += 1 - trace.note('flushed %d blobs w/ %.1fMB (%.1fMB small) to disk' - % (count, bytes / 1024. / 1024, + trace.note('flushed %d/%d blobs w/ %.1fMB (%.1fMB small) to disk' + % (count, total_blobs, bytes / 1024. / 1024, n_small_bytes / 1024. / 1024)) @@ -199,33 +208,43 @@ class CacheManager(object): else: self._blobs[id] = data - def _decref(self, id, cache, f): + def _decref(self, id, cache, fn): if not self._blob_ref_counts: - return + return False count = self._blob_ref_counts.get(id, None) if count is not None: count -= 1 if count <= 0: del cache[id] - if f is not None: - f.close() + if fn is not None: + os.unlink(fn) del self._blob_ref_counts[id] + return True else: self._blob_ref_counts[id] = count + return False def fetch_blob(self, id): """Fetch a blob of data.""" if id in self._blobs: return self._blobs.pop(id) if id in self._disk_blobs: - (is_small, offset, n_bytes, f) = self._disk_blobs[id] - f.seek(offset) - content = f.read(n_bytes) - self._decref(id, self._disk_blobs, f) + (offset, n_bytes, fn) = self._disk_blobs[id] + if fn is None: + f = self._cleanup.small_blobs + f.seek(offset) + content = f.read(n_bytes) + else: + fp = open(fn, 'rb') + try: + content = fp.read() + finally: + fp.close() + self._decref(id, self._disk_blobs, fn) return content content = self._sticky_blobs[id] - self._sticky_memory_bytes -= len(content) - self._decref(id, self._sticky_blobs, None) + if self._decref(id, self._sticky_blobs, None): + self._sticky_memory_bytes -= len(content) return content def track_heads(self, cmd): |