From 42aa5f8cf76abd0a535f9205a8376dcb702a2a76 Mon Sep 17 00:00:00 2001 From: bescoto Date: Sat, 15 Mar 2003 19:30:08 +0000 Subject: Final changes for 0.11.4 git-svn-id: http://svn.savannah.nongnu.org/svn/rdiff-backup/trunk@301 2b77aa54-bcbc-44c9-a7ec-4f6cf2b41109 --- rdiff-backup/CHANGELOG | 10 ++++- rdiff-backup/TODO | 3 +- rdiff-backup/rdiff-backup.1 | 5 +++ rdiff-backup/rdiff_backup/Globals.py | 4 ++ rdiff-backup/rdiff_backup/Main.py | 16 +++---- rdiff-backup/rdiff_backup/backup.py | 51 +++++++++++++++------- rdiff-backup/rdiff_backup/metadata.py | 34 +++++++-------- rdiff-backup/rdiff_backup/rpath.py | 4 +- rdiff-backup/rdiff_backup/statistics.py | 65 ++++++++++++++++++++++++++++- rdiff-backup/testing/metadatatest.py | 2 +- rdiff-backup/testing/statisticstest.py | 22 +--------- rdiff-backup/testing/test_with_profiling.py | 7 ++++ 12 files changed, 155 insertions(+), 68 deletions(-) create mode 100644 rdiff-backup/testing/test_with_profiling.py (limited to 'rdiff-backup') diff --git a/rdiff-backup/CHANGELOG b/rdiff-backup/CHANGELOG index 738b31c..5e7fd84 100644 --- a/rdiff-backup/CHANGELOG +++ b/rdiff-backup/CHANGELOG @@ -1,4 +1,4 @@ -New in v0.11.4 (2003/04/01) +New in v0.11.4 (2003/03/15) --------------------------- Fixed bug incrementing sockets whose filenames were pretty long, but @@ -7,6 +7,14 @@ not super long. Reported by Olivier Mueller. Added Albert Chin-A-Young's patch to add a few options to the setup.py install script. +Apparently fixed rare utime type bug. Thanks to Christian Skarby for +report and testing. + +Added detailed file_statistics (in addition to session_statistics) as +requested by Dean Gaudet. Disable with --no-file-statistics option. + +Minor speed enhancements. 
+ New in v0.11.3 (2003/03/04) --------------------------- diff --git a/rdiff-backup/TODO b/rdiff-backup/TODO index ca48674..51bb553 100644 --- a/rdiff-backup/TODO +++ b/rdiff-backup/TODO @@ -1,3 +1,4 @@ + ---------[ Medium term ]--------------------------------------- Look at Kent Borg's suggestion for restore options and digests. @@ -13,8 +14,6 @@ Add # of increments option to --remove-older-than Make argument shortcut for cstream -Make --calculate-averages work with directory_statistics file. - Write configuration file, to make sure settings like --quoting-char, --windows-time-format, etc., don't change between sessions, backup/restoring, etc. diff --git a/rdiff-backup/rdiff-backup.1 b/rdiff-backup/rdiff-backup.1 index 0c5f2b4..8597da9 100644 --- a/rdiff-backup/rdiff-backup.1 +++ b/rdiff-backup/rdiff-backup.1 @@ -218,6 +218,11 @@ Do not compress increments based on files whose filenames match regexp. The default is "(?i).*\\.(gz|z|bz|bz2|tgz|zip|rpm|deb|jpg|gif|png|jp2|mp3|ogg|avi|wmv|mpeg|mpg|rm|mov)$" .TP +.B --no-file-statistics +This will disable writing to the file_statistics file in the +rdiff-backup-data directory. rdiff-backup will run slightly quicker +and take up a bit less space. +.TP .BI --no-hard-links Don't replicate hard links on destination side. Note that because metadata is written to a separate file, hard link information will not diff --git a/rdiff-backup/rdiff_backup/Globals.py b/rdiff-backup/rdiff_backup/Globals.py index 359e351..396ab3b 100644 --- a/rdiff-backup/rdiff_backup/Globals.py +++ b/rdiff-backup/rdiff_backup/Globals.py @@ -148,6 +148,10 @@ ssh_compression = 1 # If true, print statistics after successful backup print_statistics = None +# Controls whether file_statistics file is written in +# rdiff-backup-data dir. These can sometimes take up a lot of space. +file_statistics = 1 + # On the writer connection, the following will be set to the mirror # Select iterator. 
select_mirror = None diff --git a/rdiff-backup/rdiff_backup/Main.py b/rdiff-backup/rdiff_backup/Main.py index 391e848..5381272 100644 --- a/rdiff-backup/rdiff_backup/Main.py +++ b/rdiff-backup/rdiff_backup/Main.py @@ -53,13 +53,14 @@ def parse_cmdlineoptions(arglist): "include-globbing-filelist=", "include-regexp=", "list-changed-since=", "list-increments", "no-compare-inode", "no-compression", - "no-compression-regexp=", "no-hard-links", "null-separator", - "parsable-output", "print-statistics", "quoting-char=", - "remote-cmd=", "remote-schema=", "remove-older-than=", - "restore-as-of=", "restrict=", "restrict-read-only=", - "restrict-update-only=", "server", "ssh-no-compression", - "terminal-verbosity=", "test-server", "verbosity=", - "version", "windows-mode", "windows-time-format"]) + "no-compression-regexp=", "no-file-statistics", + "no-hard-links", "null-separator", "parsable-output", + "print-statistics", "quoting-char=", "remote-cmd=", + "remote-schema=", "remove-older-than=", "restore-as-of=", + "restrict=", "restrict-read-only=", "restrict-update-only=", + "server", "ssh-no-compression", "terminal-verbosity=", + "test-server", "verbosity=", "version", "windows-mode", + "windows-time-format"]) except getopt.error, e: commandline_error("Bad commandline options: %s" % str(e)) @@ -108,6 +109,7 @@ def parse_cmdlineoptions(arglist): elif opt == "--no-compression": Globals.set("compression", None) elif opt == "--no-compression-regexp": Globals.set("no_compression_regexp_string", arg) + elif opt == "--no-file-statistics": Globals.set('file_statistics', 0) elif opt == "--no-hard-links": Globals.set('preserve_hardlinks', 0) elif opt == "--null-separator": Globals.set("null_separator", 1) elif opt == "--parsable-output": Globals.set('parsable_output', 1) diff --git a/rdiff-backup/rdiff_backup/backup.py b/rdiff-backup/rdiff_backup/backup.py index 0ff5ace..6efd8a3 100644 --- a/rdiff-backup/rdiff_backup/backup.py +++ b/rdiff-backup/rdiff_backup/backup.py @@ -237,14 
+237,21 @@ class CacheCollatedPostProcess: self.iter = collated_iter # generates (source_rorp, dest_rorp) pairs self.cache_size = cache_size self.statfileobj = statistics.init_statfileobj() + if Globals.file_statistics: statistics.FileStats.init() metadata.OpenMetadata() - # the following should map indicies to lists [source_rorp, - # dest_rorp, changed_flag, success_flag] where changed_flag - # should be true if the rorps are different, and success_flag - # should be 1 if dest_rorp has been successfully updated to - # source_rorp, and 2 if the destination file is deleted - # entirely. They both default to false (0). + # the following should map indices to lists + # [source_rorp, dest_rorp, changed_flag, success_flag, increment] + + # changed_flag should be true if the rorps are different, and + + # success_flag should be 1 if dest_rorp has been successfully + # updated to source_rorp, and 2 if the destination file is + # deleted entirely. They both default to false (0). + + # increment holds the RPath of the increment file if one + # exists. It is used to record file statistics. 
+ self.cache_dict = {} self.cache_indicies = [] @@ -255,7 +262,7 @@ class CacheCollatedPostProcess: source_rorp, dest_rorp = self.iter.next() self.pre_process(source_rorp, dest_rorp) index = source_rorp and source_rorp.index or dest_rorp.index - self.cache_dict[index] = [source_rorp, dest_rorp, 0, 0] + self.cache_dict[index] = [source_rorp, dest_rorp, 0, 0, None] self.cache_indicies.append(index) if len(self.cache_indicies) > self.cache_size: self.shorten_cache() @@ -276,15 +283,17 @@ class CacheCollatedPostProcess: """Remove one element from cache, possibly adding it to metadata""" first_index = self.cache_indicies[0] del self.cache_indicies[0] - old_source_rorp, old_dest_rorp, changed_flag, success_flag = \ + old_source_rorp, old_dest_rorp, changed_flag, success_flag, inc = \ self.cache_dict[first_index] del self.cache_dict[first_index] self.post_process(old_source_rorp, old_dest_rorp, - changed_flag, success_flag) + changed_flag, success_flag, inc) - def post_process(self, source_rorp, dest_rorp, changed, success): + def post_process(self, source_rorp, dest_rorp, changed, success, inc): """Post process source_rorp and dest_rorp. + The point of this is to write statistics and metadata. + changed will be true if the files have changed. success will be true if the files have been successfully updated (this is always false for un-changed files). 
@@ -294,12 +303,14 @@ class CacheCollatedPostProcess: if source_rorp: self.statfileobj.add_source_file(source_rorp) if dest_rorp: self.statfileobj.add_dest_file(dest_rorp) if success == 0: metadata_rorp = dest_rorp - elif success == 1: + elif success == 1 or success == 2: self.statfileobj.add_changed(source_rorp, dest_rorp) metadata_rorp = source_rorp else: metadata_rorp = None if metadata_rorp and metadata_rorp.lstat(): metadata.WriteMetadata(metadata_rorp) + if Globals.file_statistics: + statistics.FileStats.update(source_rorp, dest_rorp, changed, inc) def in_cache(self, index): """Return true if given index is cached""" @@ -317,6 +328,10 @@ class CacheCollatedPostProcess: """Signal that the file with given index has changed""" self.cache_dict[index][2] = 1 + def set_inc(self, index, inc): + """Set the increment of the current file""" + self.cache_dict[index][4] = inc + def get_rorps(self, index): """Retrieve (source_rorp, dest_rorp) from cache""" return self.cache_dict[index][:2] @@ -337,6 +352,7 @@ class CacheCollatedPostProcess: while self.cache_indicies: self.shorten_cache() metadata.CloseMetadata() if Globals.print_statistics: statistics.print_active_stats() + if Globals.file_statistics: statistics.FileStats.close() statistics.write_active_statfileobj() @@ -511,6 +527,7 @@ class IncrementITRB(PatchITRB): if self.patch_to_temp(rp, diff_rorp, tf): inc = self.inc_with_checking(tf, rp, self.get_incrp(index)) if inc is not None: + self.CCPP.set_inc(index, inc) if inc.isreg(): inc.fsync_with_dir() # Write inc before rp changed if tf.lstat(): @@ -531,10 +548,12 @@ class IncrementITRB(PatchITRB): inc = self.inc_with_checking(diff_rorp, base_rp, self.get_incrp(index)) if inc and inc.isreg(): - inc.fsync_with_dir() # must writte inc before rp changed + inc.fsync_with_dir() # must write inc before rp changed self.prepare_dir(diff_rorp, base_rp) - elif (self.set_dir_replacement(diff_rorp, base_rp) and - self.inc_with_checking(self.dir_replacement, base_rp, - 
self.get_incrp(index))): - self.CCPP.flag_success(index) + elif self.set_dir_replacement(diff_rorp, base_rp): + inc = self.inc_with_checking(self.dir_replacement, base_rp, + self.get_incrp(index)) + if inc: + self.CCPP.set_inc(index, inc) + self.CCPP.flag_success(index) diff --git a/rdiff-backup/rdiff_backup/metadata.py b/rdiff-backup/rdiff_backup/metadata.py index a434024..f1f23ac 100644 --- a/rdiff-backup/rdiff_backup/metadata.py +++ b/rdiff-backup/rdiff_backup/metadata.py @@ -104,26 +104,20 @@ def RORP2Record(rorpath): str_list.append(" Permissions %s\n" % rorpath.getperms()) return "".join(str_list) -line_parsing_regexp = re.compile("^ *([A-Za-z0-9]+) (.+)$") +line_parsing_regexp = re.compile("^ *([A-Za-z0-9]+) (.+)$", re.M) def Record2RORP(record_string): """Given record_string, return RORPath For speed reasons, write the RORPath data dictionary directly - instead of calling rorpath functions. This depends on the + instead of calling rorpath functions. Profiling has shown this to + be a time critical function. 
""" data_dict = {} - index_list = [None] # put in list so we can modify using parse_line - def process_line(line): - """Process given line, and modify data_dict or index_list""" - if not line: return # skip empty lines - match = line_parsing_regexp.search(line) - if not match: raise ParsingError("Bad line: '%s'" % line) - field, data = match.group(1), match.group(2) - + for field, data in line_parsing_regexp.findall(record_string): if field == "File": - if data == ".": index_list[0] = () - else: index_list[0] = tuple(unquote_path(data).split("/")) + if data == ".": index = () + else: index = tuple(unquote_path(data).split("/")) elif field == "Type": if data == "None": data_dict['type'] = None else: data_dict['type'] = data @@ -140,9 +134,7 @@ def Record2RORP(record_string): elif field == "Gid": data_dict['gid'] = int(data) elif field == "Permissions": data_dict['perms'] = int(data) else: raise ParsingError("Unknown field in line '%s'" % line) - - map(process_line, record_string.split("\n")) - return rpath.RORPath(index_list[0], data_dict) + return rpath.RORPath(index, data_dict) chars_to_quote = re.compile("\\n|\\\\") def quote_path(path_string): @@ -260,6 +252,7 @@ class rorp_extractor: metadata_rp = None metadata_fileobj = None +metadata_record_buffer = [] # Use this because gzip writes are slow def OpenMetadata(rp = None, compress = 1): """Open the Metadata file for writing, return metadata fileobj""" global metadata_rp, metadata_fileobj @@ -274,13 +267,20 @@ def OpenMetadata(rp = None, compress = 1): def WriteMetadata(rorp): """Write metadata of rorp to file""" - global metadata_fileobj - metadata_fileobj.write(RORP2Record(rorp)) + global metadata_fileobj, metadata_record_buffer + metadata_record_buffer.append(RORP2Record(rorp)) + if len(metadata_record_buffer) >= 100: write_metadata_buffer() + +def write_metadata_buffer(): + global metadata_record_buffer + metadata_fileobj.write("".join(metadata_record_buffer)) + metadata_record_buffer = [] def 
CloseMetadata(): """Close the metadata file""" global metadata_rp, metadata_fileobj assert metadata_fileobj, "Metadata file not open" + if metadata_record_buffer: write_metadata_buffer() try: fileno = metadata_fileobj.fileno() # will not work if GzipFile except AttributeError: fileno = metadata_fileobj.fileobj.fileno() os.fsync(fileno) diff --git a/rdiff-backup/rdiff_backup/rpath.py b/rdiff-backup/rdiff_backup/rpath.py index 909ac0a..f133eb6 100644 --- a/rdiff-backup/rdiff_backup/rpath.py +++ b/rdiff-backup/rdiff_backup/rpath.py @@ -440,8 +440,8 @@ class RORPath: def getnumlinks(self): """Number of places inode is linked to""" - try: return self.data['nlink'] - except KeyError: return 1 + if self.data.has_key('nlink'): return self.data['nlink'] + else: return 1 def readlink(self): """Wrapper around os.readlink()""" diff --git a/rdiff-backup/rdiff_backup/statistics.py b/rdiff-backup/rdiff_backup/statistics.py index 5ed6cd3..c381473 100644 --- a/rdiff-backup/rdiff_backup/statistics.py +++ b/rdiff-backup/rdiff_backup/statistics.py @@ -20,7 +20,7 @@ """Generate and process aggregated backup information""" import re, os, time -import Globals, robust, Time, rorpiter, increment, log +import Globals, Time, increment, log, static class StatsException(Exception): pass @@ -347,6 +347,69 @@ def print_active_stats(): """Print statistics of active statobj to stdout and log""" global _active_statfileobj assert _active_statfileobj + _active_statfileobj.finish() statmsg = _active_statfileobj.get_stats_logstring("Session statistics") log.Log.log_to_file(statmsg) Globals.client_conn.sys.stdout.write(statmsg) + + +class FileStats: + """Keep track of less detailed stats on file-by-file basis""" + _fileobj, _rp = None, None + _line_sep = None + def init(cls): + """Open file stats object and prepare to write""" + assert not (cls._fileobj or cls._rp), (cls._fileobj, cls._rp) + rpbase = Globals.rbdir.append("file_statistics") + suffix = Globals.compression and 'data.gz' or 'data' + cls._rp 
= increment.get_inc(rpbase, suffix, Time.curtime) + assert not cls._rp.lstat() + cls._fileobj = cls._rp.open("wb", compress = Globals.compression) + + cls._line_sep = Globals.null_separator and '\0' or '\n' + cls.write_docstring() + cls.line_buffer = [] + + def write_docstring(cls): + """Write the two-line header (documentation comments) into file""" + cls._fileobj.write("# Format of each line in file statistics file:") + cls._fileobj.write(cls._line_sep) + cls._fileobj.write("# Filename Changed SourceSize MirrorSize " + "IncrementSize" + cls._line_sep) + + def update(cls, source_rorp, dest_rorp, changed, inc): + """Update file stats with given information""" + if source_rorp: filename = source_rorp.get_indexpath() + else: filename = dest_rorp.get_indexpath() + + size_list = map(cls.get_size, [source_rorp, dest_rorp, inc]) + line = " ".join([filename, str(changed)] + size_list) + cls.line_buffer.append(line) + if len(cls.line_buffer) >= 100: cls.write_buffer() + + def get_size(cls, rorp): + """Return size of rorp as string: "NA" if rorp is missing, "0" if not a regular file""" + if not rorp: return "NA" + if rorp.isreg(): return str(rorp.getsize()) + else: return "0" + + def write_buffer(cls): + """Write buffer to file because buffer is full + + The buffer part is necessary because the GzipFile.write() + method seems fairly slow. 
+ + """ + assert cls.line_buffer and cls._fileobj + cls.line_buffer.append('') # have join add _line_sep to end also + cls._fileobj.write(cls._line_sep.join(cls.line_buffer)) + cls.line_buffer = [] + + def close(cls): + """Close file stats file""" + assert cls._fileobj, cls._fileobj + if cls.line_buffer: cls.write_buffer() + assert not cls._fileobj.close() + cls._fileobj = cls._rp = None + +static.MakeClass(FileStats) diff --git a/rdiff-backup/testing/metadatatest.py b/rdiff-backup/testing/metadatatest.py index 7b6a91a..ed7e07c 100644 --- a/rdiff-backup/testing/metadatatest.py +++ b/rdiff-backup/testing/metadatatest.py @@ -8,7 +8,7 @@ class MetadataTest(unittest.TestCase): def make_temp(self): """Make temp directory testfiles/output""" global tempdir - tempdir.delete() + if tempdir.lstat(): tempdir.delete() tempdir.mkdir() def testQuote(self): diff --git a/rdiff-backup/testing/statisticstest.py b/rdiff-backup/testing/statisticstest.py index 85a1b68..b198b61 100644 --- a/rdiff-backup/testing/statisticstest.py +++ b/rdiff-backup/testing/statisticstest.py @@ -180,26 +180,6 @@ class IncStatTest(unittest.TestCase): rbdir = rpath.RPath(Globals.local_connection, "testfiles/output/rdiff-backup-data") - #incs = Restore.get_inclist(rbdir.append("subdir"). 
- # append("directory_statistics")) - #assert len(incs) == 2 - #s1 = StatsObj().read_stats_from_rp(incs[0]) # initial mirror stats - #assert s1.SourceFiles == 2 - #assert 400000 < s1.SourceFileSize < 420000 - #self.stats_check_initial(s1) - - #subdir_stats = StatsObj().read_stats_from_rp(incs[1]) # increment stats - #assert subdir_stats.SourceFiles == 2 - #assert 400000 < subdir_stats.SourceFileSize < 420000 - #assert subdir_stats.MirrorFiles == 2 - #assert 400000 < subdir_stats.MirrorFileSize < 420000 - #assert subdir_stats.NewFiles == subdir_stats.NewFileSize == 0 - #assert subdir_stats.DeletedFiles == subdir_stats.DeletedFileSize == 0 - #assert subdir_stats.ChangedFiles == 2 - #assert 400000 < subdir_stats.ChangedSourceSize < 420000 - #assert 400000 < subdir_stats.ChangedMirrorSize < 420000 - #assert 10 < subdir_stats.IncrementFileSize < 20000 - incs = restore.get_inclist(rbdir.append("session_statistics")) assert len(incs) == 2 s2 = statistics.StatsObj().read_stats_from_rp(incs[0]) @@ -214,7 +194,7 @@ class IncStatTest(unittest.TestCase): assert 700000 <= root_stats.MirrorFileSize < 750000 assert root_stats.NewFiles == 1 assert root_stats.NewFileSize == 0 - assert root_stats.DeletedFiles == 1 + assert root_stats.DeletedFiles == 1, root_stats.DeletedFiles assert root_stats.DeletedFileSize == 200000 assert 3 <= root_stats.ChangedFiles <= 4, root_stats.ChangedFiles assert 450000 <= root_stats.ChangedSourceSize < 470000 diff --git a/rdiff-backup/testing/test_with_profiling.py b/rdiff-backup/testing/test_with_profiling.py new file mode 100644 index 0000000..13aefa3 --- /dev/null +++ b/rdiff-backup/testing/test_with_profiling.py @@ -0,0 +1,7 @@ +import profile, pstats +from metadatatest import * + +profile.run("unittest.main()", "profile-output") +p = pstats.Stats("profile-output") +p.sort_stats('time') +p.print_stats(40) -- cgit v1.2.1