From 5ccca65dec1ef00d3551493958af5ee27e7df212 Mon Sep 17 00:00:00 2001 From: bescoto Date: Sun, 12 Oct 2003 05:24:25 +0000 Subject: New hard link system should use less memory git-svn-id: http://svn.savannah.nongnu.org/svn/rdiff-backup/trunk@468 2b77aa54-bcbc-44c9-a7ec-4f6cf2b41109 --- rdiff-backup/CHANGELOG | 2 + rdiff-backup/rdiff_backup/Hardlink.py | 150 ++++++++++------------------------ rdiff-backup/rdiff_backup/backup.py | 13 +-- rdiff-backup/rdiff_backup/restore.py | 12 +-- rdiff-backup/rdiff_backup/rpath.py | 1 + rdiff-backup/testing/commontest.py | 129 ++++++++++++++--------------- rdiff-backup/testing/hardlinktest.py | 58 +++---------- 7 files changed, 135 insertions(+), 230 deletions(-) diff --git a/rdiff-backup/CHANGELOG b/rdiff-backup/CHANGELOG index 7a7c0f1..6c714ef 100644 --- a/rdiff-backup/CHANGELOG +++ b/rdiff-backup/CHANGELOG @@ -18,6 +18,8 @@ that contain unreadable files and directories as long as they are owned by that user. Bug report by Arkadiusz Miskiewicz. Hopefully this is the last of the unreadable file bugs... +Rewrote hard link tracking system. New way should use less memory. + New in v0.13.2 (2003/09/16) --------------------------- diff --git a/rdiff-backup/rdiff_backup/Hardlink.py b/rdiff-backup/rdiff_backup/Hardlink.py index 453672e..81472b2 100644 --- a/rdiff-backup/rdiff_backup/Hardlink.py +++ b/rdiff-backup/rdiff_backup/Hardlink.py @@ -34,137 +34,80 @@ from __future__ import generators import cPickle import Globals, Time, rpath, log, robust -# In all of these lists of indicies are the values. The keys in -# _inode_ ones are (inode, devloc) pairs. -_src_inode_indicies = None -_dest_inode_indicies = None - -# The keys for these two are just indicies. They share values -# with the earlier dictionaries. -_src_index_indicies = None -_dest_index_indicies = None - -# When a linked file is restored, its path is added to this dict, -# so it can be found when later paths being restored are linked to -# it. -_restore_index_path = None +# The keys in this dictionary are (inode, devloc) pairs. The values +# are a pair (index, remaining_links, dest_key) where index is the +# rorp index of the first such linked file, remaining_links is the +# number of files hard linked to this one we may see, and key is +# either (dest_inode, dest_devloc) or None, and represents the +# hardlink info of the existing file on the destination. +_inode_index = None def initialize_dictionaries(): """Set all the hard link dictionaries to empty""" - global _src_inode_indicies, _dest_inode_indicies - global _src_index_indicies, _dest_index_indicies, _restore_index_path - _src_inode_indicies = {} - _dest_inode_indicies = {} - _src_index_indicies = {} - _dest_index_indicies = {} - _restore_index_path = {} + global _inode_index + _inode_index = {} def clear_dictionaries(): """Delete all dictionaries""" - global _src_inode_indicies, _dest_inode_indicies - global _src_index_indicies, _dest_index_indicies, _restore_index_path - _src_inode_indicies = _dest_inode_indicies = None - _src_index_indicies = _dest_index_indicies = _restore_index_path = None - + global _inode_index + _inode_index = None def get_inode_key(rorp): """Return rorp's key for _inode_ dictionaries""" return (rorp.getinode(), rorp.getdevloc()) -def get_indicies(rorp, source): - """Return a list of similarly linked indicies, using rorp's index""" - if source: dict = _src_index_indicies - else: dict = _dest_index_indicies - try: return dict[rorp.index] - except KeyError: return [] - -def add_rorp(rorp, source): - """Process new rorp and update hard link dictionaries - - First enter it into src_inode_indicies. If we have already - seen all the hard links, then we can delete the entry. - Everything must stay recorded in src_index_indicies though. - - """ +def add_rorp(rorp, dest_rorp = None): + """Process new rorp and update hard link dictionaries""" + if not rorp.isreg() or rorp.getnumlinks() < 2: return + rp_inode_key = get_inode_key(rorp) + if not _inode_index.has_key(rp_inode_key): + if not dest_rorp: dest_key = None + elif dest_rorp.getnumlinks() == 1: dest_key = "NA" + else: dest_key = get_inode_key(dest_rorp) + _inode_index[rp_inode_key] = (rorp.index, rorp.getnumlinks(), dest_key) + +def del_rorp(rorp): + """Remove rorp information from dictionary if seen all links""" if not rorp.isreg() or rorp.getnumlinks() < 2: return - - if source: - inode_dict, index_dict = _src_inode_indicies, _src_index_indicies - else: inode_dict, index_dict = _dest_inode_indicies, _dest_index_indicies - rp_inode_key = get_inode_key(rorp) - if inode_dict.has_key(rp_inode_key): - index_list = inode_dict[rp_inode_key] - index_list.append(rorp.index) - if len(index_list) == rorp.getnumlinks(): - del inode_dict[rp_inode_key] - else: # make new entry in both src dicts - index_list = [rorp.index] - inode_dict[rp_inode_key] = index_list - index_dict[rorp.index] = index_list - -def add_rorp_iter(iter, source): - """Return new rorp iterator like iter that add_rorp's first""" - for rorp in iter: - add_rorp(rorp, source) - yield rorp + val = _inode_index.get(rp_inode_key) + if not val: return + index, remaining, dest_key = val + if remaining == 1: del _inode_index[rp_inode_key] + else: _inode_index[rp_inode_key] = (index, remaining-1, dest_key) def rorp_eq(src_rorp, dest_rorp): """Compare hardlinked for equality - Two files may otherwise seem equal but be hardlinked in - different ways. This function considers them equal enough if - they have been hardlinked correctly to the previously seen - indicies. + Return false if dest_rorp is linked differently, which can happen + if dest is linked more than source, or if it is represented by a + different inode. """ if (not src_rorp.isreg() or not dest_rorp.isreg() or src_rorp.getnumlinks() == dest_rorp.getnumlinks() == 1): return 1 # Hard links don't apply - src_index_list = get_indicies(src_rorp, 1) - dest_index_list = get_indicies(dest_rorp, None) - - # If a list only has one element, then it is only hardlinked - # to itself so far, so that is not a genuine difference yet. - if not src_index_list or len(src_index_list) == 1: - return not dest_index_list or len(dest_index_list) == 1 - if not dest_index_list or len(dest_index_list) == 1: return None - - # Both index lists exist and are non-empty - return src_index_list == dest_index_list # they are always sorted + if src_rorp.getnumlinks() < dest_rorp.getnumlinks(): return 0 + src_key = get_inode_key(src_rorp) + index, remaining, dest_key = _inode_index[src_key] + if dest_key == "NA": + # Allow this to be ok for first comparison, but not any + # subsequent ones + _inode_index[src_key] = (index, remaining, None) + return 1 + return dest_key == get_inode_key(dest_rorp) def islinked(rorp): """True if rorp's index is already linked to something on src side""" - return len(get_indicies(rorp, 1)) >= 2 + if not rorp.getnumlinks() > 1: return 0 + dict_val = _inode_index.get(get_inode_key(rorp)) + if not dict_val: return 0 + return dict_val[0] != rorp.index # If equal, then rorp is first def get_link_index(rorp): """Return first index on target side rorp is already linked to""" - return get_indicies(rorp, 1)[0] - -def restore_link(index, rpath): - """Restores a linked file by linking it - - When restoring, all the hardlink data is already present, and - we can only link to something already written. In either - case, add to the _restore_index_path dict, so we know later - that the file is available for hard - linking. - - Returns true if succeeded in creating rpath, false if must - restore rpath normally. - - """ - if index not in _src_index_indicies: return None - for linked_index in _src_index_indicies[index]: - if linked_index in _restore_index_path: - srcpath = _restore_index_path[linked_index] - log.Log("Restoring %s by hard linking to %s" % - (rpath.path, srcpath), 6) - rpath.hardlink(srcpath) - return 1 - _restore_index_path[index] = rpath.path - return None + return _inode_index[get_inode_key(rorp)][0] def link_rp(diff_rorp, dest_rpath, dest_root = None): """Make dest_rpath into a link using link flag in diff_rorp""" @@ -172,6 +115,3 @@ def link_rp(diff_rorp, dest_rpath, dest_root = None): dest_link_rpath = dest_root.new_index(diff_rorp.get_link_flag()) dest_rpath.hardlink(dest_link_rpath.path) - - - diff --git a/rdiff-backup/rdiff_backup/backup.py b/rdiff-backup/rdiff_backup/backup.py index a89f21e..850e266 100644 --- a/rdiff-backup/rdiff_backup/backup.py +++ b/rdiff-backup/rdiff_backup/backup.py @@ -180,10 +180,10 @@ class DestinationStruct: def get_one_sig(cls, dest_base_rpath, index, src_rorp, dest_rorp): """Return a signature given source and destination rorps""" - if (Globals.preserve_hardlinks and - Hardlink.islinked(src_rorp or dest_rorp)): + if (Globals.preserve_hardlinks and src_rorp and + Hardlink.islinked(src_rorp)): dest_sig = rpath.RORPath(index) - dest_sig.flaglinked(Hardlink.get_link_index(dest_sig)) + dest_sig.flaglinked(Hardlink.get_link_index(src_rorp)) elif dest_rorp: dest_sig = dest_rorp.getRORPath() if dest_rorp.isreg(): @@ -312,8 +312,8 @@ class CacheCollatedPostProcess: will be backed up correctly. """ - if source_rorp: Hardlink.add_rorp(source_rorp, source = 1) - if dest_rorp: Hardlink.add_rorp(dest_rorp, source = 0) + if Globals.preserve_hardlinks and source_rorp: + Hardlink.add_rorp(source_rorp, dest_rorp) if (dest_rorp and dest_rorp.isdir() and Globals.process_uid != 0 and dest_rorp.getperms() % 01000 < 0700): self.unreadable_dir_init(source_rorp, dest_rorp) @@ -355,6 +355,9 @@ class CacheCollatedPostProcess: always false for un-changed files). """ + if Globals.preserve_hardlinks and source_rorp: + Hardlink.del_rorp(source_rorp) + if not changed or success: if source_rorp: self.statfileobj.add_source_file(source_rorp) if dest_rorp: self.statfileobj.add_dest_file(dest_rorp) diff --git a/rdiff-backup/rdiff_backup/restore.py b/rdiff-backup/rdiff_backup/restore.py index 8d64c36..7a4f0bf 100644 --- a/rdiff-backup/rdiff_backup/restore.py +++ b/rdiff-backup/rdiff_backup/restore.py @@ -235,15 +235,17 @@ class MirrorStruct: def get_diffs_from_collated(cls, collated): """Get diff iterator from collated""" for mir_rorp, target_rorp in collated: - if Globals.preserve_hardlinks: - if mir_rorp: Hardlink.add_rorp(mir_rorp, source = 1) - if target_rorp: Hardlink.add_rorp(target_rorp, source = 0) - + if Globals.preserve_hardlinks and mir_rorp: + Hardlink.add_rorp(mir_rorp, target_rorp) if (not target_rorp or not mir_rorp or not mir_rorp == target_rorp or (Globals.preserve_hardlinks and not Hardlink.rorp_eq(mir_rorp, target_rorp))): - yield cls.get_diff(mir_rorp, target_rorp) + diff = cls.get_diff(mir_rorp, target_rorp) + else: diff = None + if Globals.preserve_hardlinks and mir_rorp: + Hardlink.del_rorp(mir_rorp) + if diff: yield diff def get_diff(cls, mir_rorp, target_rorp): """Get a diff for mir_rorp at time""" diff --git a/rdiff-backup/rdiff_backup/rpath.py b/rdiff-backup/rdiff_backup/rpath.py index 1786dd8..b2f6f20 100644 --- a/rdiff-backup/rdiff_backup/rpath.py +++ b/rdiff-backup/rdiff_backup/rpath.py @@ -761,6 +761,7 @@ class RPath(RORPath): def hardlink(self, linkpath): """Make self into a hardlink joined to linkpath""" + log.Log("Hard linking %s to %s" % (self.path, linkpath), 6) self.conn.os.link(linkpath, self.path) self.setdata() diff --git a/rdiff-backup/testing/commontest.py b/rdiff-backup/testing/commontest.py index e80d699..9d1ee81 100644 --- a/rdiff-backup/testing/commontest.py +++ b/rdiff-backup/testing/commontest.py @@ -3,7 +3,7 @@ import os, sys, code from rdiff_backup.log import Log from rdiff_backup.rpath import RPath from rdiff_backup import Globals, Hardlink, SetConnections, Main, \ - selection, lazy, Time, rpath, eas_acls + selection, lazy, Time, rpath, eas_acls, rorpiter RBBin = "../rdiff-backup" SourceDir = "../rdiff_backup" @@ -175,63 +175,78 @@ def CompareRecursive(src_rp, dest_rp, compare_hardlinks = 1, specified. """ - if compare_hardlinks: reset_hardlink_dicts() - src_rp.setdata() - dest_rp.setdata() - - Log("Comparing %s and %s, hardlinks %s, eas %s, acls %s" % - (src_rp.path, dest_rp.path, compare_hardlinks, - compare_eas, compare_acls), 3) - src_select = selection.Select(src_rp) - dest_select = selection.Select(dest_rp) - - if ignore_tmp_files: - # Ignoring temp files can be useful when we want to check the - # correctness of a backup which aborted in the middle. In - # these cases it is OK to have tmp files lying around. - src_select.add_selection_func(src_select.regexp_get_sf( - ".*rdiff-backup.tmp.[^/]+$", 0)) - dest_select.add_selection_func(dest_select.regexp_get_sf( - ".*rdiff-backup.tmp.[^/]+$", 0)) - - if exclude_rbdir: - src_select.parse_rbdir_exclude() - dest_select.parse_rbdir_exclude() - else: - # include rdiff-backup-data/increments - src_select.add_selection_func(src_select.glob_get_tuple_sf( - ('rdiff-backup-data', 'increments'), 1)) - dest_select.add_selection_func(dest_select.glob_get_tuple_sf( - ('rdiff-backup-data', 'increments'), 1)) - - # but exclude rdiff-backup-data - src_select.add_selection_func(src_select.glob_get_tuple_sf( - ('rdiff-backup-data',), 0)) - dest_select.add_selection_func(dest_select.glob_get_tuple_sf( - ('rdiff-backup-data',), 0)) - - dsiter1, dsiter2 = src_select.set_iter(), dest_select.set_iter() - - def hardlink_equal(src_rorp, dest_rorp): + def get_selection_functions(): + """Return generators of files in source, dest""" + src_rp.setdata() + dest_rp.setdata() + src_select = selection.Select(src_rp) + dest_select = selection.Select(dest_rp) + + if ignore_tmp_files: + # Ignoring temp files can be useful when we want to check the + # correctness of a backup which aborted in the middle. In + # these cases it is OK to have tmp files lying around. + src_select.add_selection_func(src_select.regexp_get_sf( + ".*rdiff-backup.tmp.[^/]+$", 0)) + dest_select.add_selection_func(dest_select.regexp_get_sf( + ".*rdiff-backup.tmp.[^/]+$", 0)) + + if exclude_rbdir: # Exclude rdiff-backup-data directory + src_select.parse_rbdir_exclude() + dest_select.parse_rbdir_exclude() + + return src_select.set_iter(), dest_select.set_iter() + + def preprocess(src_rorp, dest_rorp): + """Initially process src and dest_rorp""" + if compare_hardlinks and src_rorp: + Hardlink.add_rorp(src_rorp, dest_rorp) + + def postprocess(src_rorp, dest_rorp): + """After comparison, process src_rorp and dest_rorp""" + if compare_hardlinks and src_rorp: + Hardlink.del_rorp(src_rorp) + + def equality_func(src_rorp, dest_rorp): + """Combined eq func returns true iff two files compare same""" + if not src_rorp: + Log("Source rorp missing: " + str(dest_rorp), 3) + return 0 + if not dest_rorp: + Log("Dest rorp missing: " + str(src_rorp), 3) + return 0 if not src_rorp.equal_verbose(dest_rorp, compare_ownership = compare_ownership): - return None - if not Hardlink.rorp_eq(src_rorp, dest_rorp): + return 0 + if compare_hardlinks and not Hardlink.rorp_eq(src_rorp, dest_rorp): + Log("Hardlink compare failure", 3) Log("%s: %s" % (src_rorp.index, - Hardlink.get_indicies(src_rorp, 1)), 3) + Hardlink.get_inode_key(src_rorp)), 3) Log("%s: %s" % (dest_rorp.index, - Hardlink.get_indicies(dest_rorp, None)), 3) - return None + Hardlink.get_inode_key(dest_rorp)), 3) + return 0 if compare_eas and not eas_acls.ea_compare_rps(src_rorp, dest_rorp): Log("Different EAs in files %s and %s" % (src_rorp.get_indexpath(), dest_rorp.get_indexpath()), 3) - return None + return 0 if compare_acls and not eas_acls.acl_compare_rps(src_rorp, dest_rorp): Log("Different ACLs in files %s and %s" % (src_rorp.get_indexpath(), dest_rorp.get_indexpath()), 3) - return None + return 0 return 1 + Log("Comparing %s and %s, hardlinks %s, eas %s, acls %s" % + (src_rp.path, dest_rp.path, compare_hardlinks, + compare_eas, compare_acls), 3) + if compare_hardlinks: reset_hardlink_dicts() + src_iter, dest_iter = get_selection_functions() + for src_rorp, dest_rorp in rorpiter.Collate2Iters(src_iter, dest_iter): + preprocess(src_rorp, dest_rorp) + if not equality_func(src_rorp, dest_rorp): return 0 + postprocess(src_rorp, dest_rorp) + return 1 + + def rbdir_equal(src_rorp, dest_rorp): """Like hardlink_equal, but make allowances for data directories""" if not src_rorp.index and not dest_rorp.index: return 1 @@ -263,30 +278,10 @@ def CompareRecursive(src_rp, dest_rp, compare_hardlinks = 1, Hardlink.get_indicies(dest_rorp, None)), 3) return None - if equality_func: result = lazy.Iter.equal(dsiter1, dsiter2, - 1, equality_func) - elif compare_hardlinks: - dsiter1 = Hardlink.add_rorp_iter(dsiter1, 1) - dsiter2 = Hardlink.add_rorp_iter(dsiter2, None) - if exclude_rbdir: - result = lazy.Iter.equal(dsiter1, dsiter2, 1, hardlink_equal) - else: result = lazy.Iter.equal(dsiter1, dsiter2, 1, rbdir_equal) - elif not exclude_rbdir: - result = lazy.Iter.equal(dsiter1, dsiter2, 1, rbdir_equal) - else: result = lazy.Iter.equal(dsiter1, dsiter2, 1, - lambda x, y: x.equal_verbose(y, compare_ownership = compare_ownership)) - - for i in dsiter1: pass # make sure all files processed anyway - for i in dsiter2: pass - return result def reset_hardlink_dicts(): """Clear the hardlink dictionaries""" - Hardlink._src_inode_indicies = {} - Hardlink._src_index_indicies = {} - Hardlink._dest_inode_indicies = {} - Hardlink._dest_index_indicies = {} - Hardlink._restore_index_path = {} + Hardlink._inode_index = {} def BackupRestoreSeries(source_local, dest_local, list_of_dirnames, compare_hardlinks = 1, diff --git a/rdiff-backup/testing/hardlinktest.py b/rdiff-backup/testing/hardlinktest.py index 22dc31d..c3827f6 100644 --- a/rdiff-backup/testing/hardlinktest.py +++ b/rdiff-backup/testing/hardlinktest.py @@ -2,7 +2,7 @@ import os, unittest, time from commontest import * from rdiff_backup import Globals, Hardlink, selection, rpath -Log.setverbosity(3) +Log.setverbosity(6) class HardlinkTest(unittest.TestCase): """Test cases for Hard links""" @@ -34,62 +34,24 @@ class HardlinkTest(unittest.TestCase): Globals.preserve_hardlinks = 1 reset_hardlink_dicts() for dsrp in selection.Select(self.hardlink_dir3).set_iter(): - Hardlink.add_rorp(dsrp, 1) + Hardlink.add_rorp(dsrp) - assert len(Hardlink._src_inode_indicies.keys()) == 3, \ - Hardlink._src_inode_indicies - assert len(Hardlink._src_index_indicies.keys()) == 3, \ - Hardlink._src_index_indicies - vals1 = Hardlink._src_inode_indicies.values() - vals2 = Hardlink._src_index_indicies.values() - vals1.sort() - vals2.sort() - assert vals1 == vals2 - - def testBuildingDict2(self): - """Same as testBuildingDict but test destination building""" - Globals.preserve_hardlinks = 1 - reset_hardlink_dicts() - for dsrp in selection.Select(self.hardlink_dir3).set_iter(): - Hardlink.add_rorp(dsrp, None) - - assert len(Hardlink._dest_inode_indicies.keys()) == 3, \ - Hardlink._dest_inode_indicies - assert len(Hardlink._dest_index_indicies.keys()) == 3, \ - Hardlink._dest_index_indicies - vals1 = Hardlink._dest_inode_indicies.values() - vals2 = Hardlink._dest_index_indicies.values() - vals1.sort() - vals2.sort() - assert vals1 == vals2 + assert len(Hardlink._inode_index.keys()) == 3, \ + Hardlink._inode_index def testCompletedDict(self): """See if the hardlink dictionaries are built correctly""" reset_hardlink_dicts() for dsrp in selection.Select(self.hardlink_dir1).set_iter(): - Hardlink.add_rorp(dsrp, 1) - assert Hardlink._src_inode_indicies == {}, \ - Hardlink._src_inode_indicies - - hll1 = [('file1',), ('file2',), ('file3',)] - hll2 = [('file4',), ('file5',), ('file6',)] - dict = {} - for index in hll1: dict[index] = hll1 - for index in hll2: dict[index] = hll2 - assert Hardlink._src_index_indicies == dict + Hardlink.add_rorp(dsrp) + Hardlink.del_rorp(dsrp) + assert Hardlink._inode_index == {}, Hardlink._inode_index reset_hardlink_dicts() for dsrp in selection.Select(self.hardlink_dir2).set_iter(): - Hardlink.add_rorp(dsrp, 1) - assert Hardlink._src_inode_indicies == {}, \ - Hardlink._src_inode_indicies - - hll1 = [('file1',), ('file3',), ('file4',)] - hll2 = [('file2',), ('file5',), ('file6',)] - dict = {} - for index in hll1: dict[index] = hll1 - for index in hll2: dict[index] = hll2 - assert Hardlink._src_index_indicies == dict + Hardlink.add_rorp(dsrp) + Hardlink.del_rorp(dsrp) + assert Hardlink._inode_index == {}, Hardlink._inode_index def testSeries(self): """Test hardlink system by backing up and restoring a few dirs""" -- cgit v1.2.1