From 81ac975fe2bdcd95de1faed9a318f3867aa95059 Mon Sep 17 00:00:00 2001 From: bescoto Date: Sat, 24 Dec 2005 20:02:05 +0000 Subject: Reduce hardlink memory usage git-svn-id: http://svn.savannah.nongnu.org/svn/rdiff-backup/trunk@719 2b77aa54-bcbc-44c9-a7ec-4f6cf2b41109 --- rdiff-backup/CHANGELOG | 3 +++ rdiff-backup/rdiff-backup.1 | 7 ++----- rdiff-backup/rdiff_backup/Hardlink.py | 28 +++++++++++++++++----------- rdiff-backup/rdiff_backup/backup.py | 19 ++++++------------- rdiff-backup/testing/commontest.py | 17 ++++++++++++++++- 5 files changed, 44 insertions(+), 30 deletions(-) diff --git a/rdiff-backup/CHANGELOG b/rdiff-backup/CHANGELOG index 8c9aa16..4766486 100644 --- a/rdiff-backup/CHANGELOG +++ b/rdiff-backup/CHANGELOG @@ -14,6 +14,9 @@ Added supplementary rdiff-backup-statistics utility for parsing rdiff-backup's statistics files (originally based off perl script by Dean Gaudet). +rdiff-backup should now use much less memory than v1.1.1-1.1.4 if you +have lots of hard links. + New in v1.1.4 (2005/12/13) -------------------------- diff --git a/rdiff-backup/rdiff-backup.1 b/rdiff-backup/rdiff-backup.1 index e785e98..e13a9f0 100644 --- a/rdiff-backup/rdiff-backup.1 +++ b/rdiff-backup/rdiff-backup.1 @@ -314,11 +314,8 @@ rdiff-backup-data directory. rdiff-backup will run slightly quicker and take up a bit less space. .TP .BI --no-hard-links -Don't replicate hard links on destination side. Note that because -metadata is written to a separate file, hard link information will not -be lost even if the --no-hard-links option is given (however, mirror -files will not be linked). If many hard-linked files are present, -this option can drastically decrease memory usage. +Don't replicate hard links on destination side. If many hard-linked +files are present, this option can drastically decrease memory usage. 
.TP .B --null-separator Use nulls (\\0) instead of newlines (\\n) as line separators, which diff --git a/rdiff-backup/rdiff_backup/Hardlink.py b/rdiff-backup/rdiff_backup/Hardlink.py index 855c512..1dddbbb 100644 --- a/rdiff-backup/rdiff_backup/Hardlink.py +++ b/rdiff-backup/rdiff_backup/Hardlink.py @@ -31,15 +31,15 @@ source side should only transmit inode information. """ from __future__ import generators -import cPickle -import Globals, Time, rpath, log, robust, errno +import Globals, Time, log, robust, errno # The keys in this dictionary are (inode, devloc) pairs. The values -# are a pair (index, remaining_links, dest_key) where index is the -# rorp index of the first such linked file, remaining_links is the -# number of files hard linked to this one we may see, and key is +# are a tuple (index, remaining_links, dest_key, sha1sum) where index +# is the rorp index of the first such linked file, remaining_links is +# the number of files hard linked to this one we may see, and dest_key is # either (dest_inode, dest_devloc) or None, and represents the -# hardlink info of the existing file on the destination. +# hardlink info of the existing file on the destination. Finally +# sha1sum is the hash of the file if it exists, or None. 
_inode_index = None def initialize_dictionaries(): @@ -64,7 +64,9 @@ def add_rorp(rorp, dest_rorp = None): if not dest_rorp: dest_key = None elif dest_rorp.getnumlinks() == 1: dest_key = "NA" else: dest_key = get_inode_key(dest_rorp) - _inode_index[rp_inode_key] = (rorp.index, rorp.getnumlinks(), dest_key) + digest = rorp.has_sha1() and rorp.get_sha1() or None + _inode_index[rp_inode_key] = (rorp.index, rorp.getnumlinks(), + dest_key, digest) return rp_inode_key def del_rorp(rorp): @@ -73,12 +75,12 @@ def del_rorp(rorp): rp_inode_key = get_inode_key(rorp) val = _inode_index.get(rp_inode_key) if not val: return - index, remaining, dest_key = val + index, remaining, dest_key, digest = val if remaining == 1: del _inode_index[rp_inode_key] return 1 else: - _inode_index[rp_inode_key] = (index, remaining-1, dest_key) + _inode_index[rp_inode_key] = (index, remaining-1, dest_key, digest) return 0 def rorp_eq(src_rorp, dest_rorp): @@ -95,11 +97,11 @@ def rorp_eq(src_rorp, dest_rorp): if src_rorp.getnumlinks() < dest_rorp.getnumlinks(): return 0 src_key = get_inode_key(src_rorp) - index, remaining, dest_key = _inode_index[src_key] + index, remaining, dest_key, digest = _inode_index[src_key] if dest_key == "NA": # Allow this to be ok for first comparison, but not any # subsequent ones - _inode_index[src_key] = (index, remaining, None) + _inode_index[src_key] = (index, remaining, None, None) return 1 return dest_key == get_inode_key(dest_rorp) @@ -114,6 +116,10 @@ def get_link_index(rorp): """Return first index on target side rorp is already linked to""" return _inode_index[get_inode_key(rorp)][0] +def get_sha1(rorp): + """Return sha1 digest of what rorp is linked to""" + return _inode_index[get_inode_key(rorp)][3] + def link_rp(diff_rorp, dest_rpath, dest_root = None): """Make dest_rpath into a link using link flag in diff_rorp""" if not dest_root: dest_root = dest_rpath # use base of dest_rpath diff --git a/rdiff-backup/rdiff_backup/backup.py 
b/rdiff-backup/rdiff_backup/backup.py index 819ae91..181c918 100644 --- a/rdiff-backup/rdiff_backup/backup.py +++ b/rdiff-backup/rdiff_backup/backup.py @@ -295,11 +295,6 @@ class CacheCollatedPostProcess: # after we're finished with them self.dir_perms_list = [] - # A dictionary of {index: source_rorp}. We use this to - # hold the digest of a hard linked file so it only needs to be - # computed once. - self.inode_digest_dict = {} - # Contains list of (index, (source_rorp, diff_rorp)) pairs for # the parent directories of the last item in the cache. self.parent_list = [] @@ -326,8 +321,7 @@ class CacheCollatedPostProcess: """ if Globals.preserve_hardlinks and source_rorp: - if Hardlink.add_rorp(source_rorp, dest_rorp): - self.inode_digest_dict[source_rorp.index] = source_rorp + Hardlink.add_rorp(source_rorp, dest_rorp) if (dest_rorp and dest_rorp.isdir() and Globals.process_uid != 0 and dest_rorp.getperms() % 01000 < 0700): self.unreadable_dir_init(source_rorp, dest_rorp) @@ -394,8 +388,7 @@ class CacheCollatedPostProcess: """ if Globals.preserve_hardlinks and source_rorp: - if Hardlink.del_rorp(source_rorp): - del self.inode_digest_dict[source_rorp.index] + Hardlink.del_rorp(source_rorp) if not changed or success: if source_rorp: self.statfileobj.add_source_file(source_rorp) @@ -469,10 +462,10 @@ class CacheCollatedPostProcess: def update_hardlink_hash(self, diff_rorp): """Tag associated source_rorp with same hash diff_rorp points to""" - orig_rorp = self.inode_digest_dict[diff_rorp.get_link_flag()] - if orig_rorp.has_sha1(): - new_source_rorp = self.get_source_rorp(diff_rorp.index) - new_source_rorp.set_sha1(orig_rorp.get_sha1()) + sha1sum = Hardlink.get_sha1(diff_rorp) + if not sha1sum: return + source_rorp = self.get_source_rorp(diff_rorp.index) + source_rorp.set_sha1(sha1sum) def close(self): """Process the remaining elements in the cache""" diff --git a/rdiff-backup/testing/commontest.py b/rdiff-backup/testing/commontest.py index 04cdb93..e9b272e 100644 --- 
a/rdiff-backup/testing/commontest.py +++ b/rdiff-backup/testing/commontest.py @@ -386,4 +386,19 @@ def raise_interpreter(use_locals = None): else: local_dict = globals() code.InteractiveConsole(local_dict).interact() - +def getrefs(i, depth): + """Get the i'th object in memory, return objects that reference it""" + import sys, gc, types + o = sys.getobjects(i)[-1] + for d in range(depth): + for ref in gc.get_referrers(o): + if type(ref) in (types.ListType, types.DictType, + types.InstanceType): + if type(ref) is types.DictType and ref.has_key('copyright'): + continue + o = ref + break + else: + print "Max depth ", d + return o + return o -- cgit v1.2.1