summary refs log tree commit diff
diff options
context:
space:
mode:
author bescoto <bescoto@2b77aa54-bcbc-44c9-a7ec-4f6cf2b41109> 2005-12-22 21:11:52 +0000
committer bescoto <bescoto@2b77aa54-bcbc-44c9-a7ec-4f6cf2b41109> 2005-12-22 21:11:52 +0000
commit ce14d9db6ba93f820b9b1ff7df733bd57420f510 (patch)
tree 69b1dc8f8cb18f56d55e0829e1bab3404d498327
parent 638b4f9a8f256079c9e32c32ea800939069061a1 (diff)
download rdiff-backup-ce14d9db6ba93f820b9b1ff7df733bd57420f510.tar.gz
uses less memory, can merge multiple increments
git-svn-id: http://svn.savannah.nongnu.org/svn/rdiff-backup/trunk@717 2b77aa54-bcbc-44c9-a7ec-4f6cf2b41109
-rwxr-xr-x rdiff-backup/rdiff-backup-statistics 528
1 files changed, 375 insertions, 153 deletions
diff --git a/rdiff-backup/rdiff-backup-statistics b/rdiff-backup/rdiff-backup-statistics
index 761b0c3..b289fe7 100755
--- a/rdiff-backup/rdiff-backup-statistics
+++ b/rdiff-backup/rdiff-backup-statistics
@@ -20,65 +20,292 @@
# USA
-import os, sys, re
-import rdiff_backup.connection, rdiff_backup.regress
-import rdiff_backup.rpath as rpath
-import rdiff_backup.Globals as Globals
-import rdiff_backup.restore as restore
-
-
-tag = None # Set to an rdiff-backup session time
-
-def check_args():
- global tag
- def error(msg):
- sys.stderr.write("Command line error: %s\n" % (msg,))
- sys.exit(2)
- if not (2 <= len(sys.argv) <= 3):
- error("usage: %s mirror-directory [date_tag]" % (sys.argv[0],))
+import profile, pstats
+import os, sys, re, getopt
+from rdiff_backup import connection, regress, rpath, Globals, restore, Time, lazy
+
+
+begin_time = None # Parse statistics at or after this time...
+end_time = None # ... and at or before this time (epoch seconds)
+min_ratio = .05 # report only files/directories over this number
+
+def parse_args():
+ global begin_time, end_time, min_ratio # module-level settings set from options
+ try: optlist, args = getopt.getopt(sys.argv[1:], "",
+ ["begin-time=", "end-time=", "minimum-ratio="])
+ except getopt.error, e:
+ sys.exit("Bad commandline options: " + str(e))
+
+ for opt, arg in optlist:
+ if opt == "--begin-time": begin_time = Time.genstrtotime(arg)
+ elif opt == "--end-time": end_time = Time.genstrtotime(arg)
+ elif opt == "--minimum-ratio": min_ratio = float(arg)
+ else: assert 0 # getopt only returns the options listed above
+
+ if len(args) != 1: # exactly one positional argument: the backup directory
+ sys.exit("Usage: %s [--begin-time <time>] [--end-time <time>] "
+ "[--minimum-ratio <ratio>] <backup-dir>" % (sys.argv[0],))
+
Globals.rbdir = rpath.RPath(Globals.local_connection,
- os.path.join(sys.argv[1], 'rdiff-backup-data'))
+ os.path.join(args[0], 'rdiff-backup-data'))
if not Globals.rbdir.isdir():
- error("Directory %s not found" % (Globals.rbdir.path,))
+ sys.exit("Directory %s not found" % (Globals.rbdir.path,))
if len(sys.argv) == 3: tag = sys.argv[2]
def system(cmd):
if os.system(cmd): sys.exit("Error running command '%s'\n" % (cmd,))
-def get_rbdir_inc_rpath(prefix):
- """Get rp in rdiff-backup-data given prefix (either newest or with tag)"""
- if tag:
- rp1 = Globals.rbdir.append('%s.%s.data' % (prefix, tag))
- if rp1.isreg(): return rp1
- rp2 = Globals.rbdir.append('%s.%s.data.gz' % (prefix, tag))
- if rp2.isreg(): return rp2
- sys.exit(rp.path + " is not a regular file. Bad tag specified?")
- else:
- rp_base = Globals.rbdir.append(prefix)
- inclist = restore.get_inclist(rp_base)
- if not inclist: sys.exit("No data files in rdiff-backup-data dir "
- "starting with %s were found!" % (prefix,))
- inclist.sort(key = lambda i: i.getinctime())
- return inclist[-1]
-
-def print_statistics():
- print "\nSession statistics:"
- print get_rbdir_inc_rpath('session_statistics').get_data()
- print "\nAverage statistics:"
- system("rdiff-backup --calculate-average %s/session_statistics.*" %
- (Globals.rbdir.path,))
-
-def get_open_filestats():
- """Return open file object based on file_statistics"""
- file_stats_rp = get_rbdir_inc_rpath('file_statistics')
- assert file_stats_rp.isincfile()
- fileobj = file_stats_rp.open('r', file_stats_rp.isinccompressed())
- fileobj.readline()
- if fileobj.readline() != ("# Filename Changed SourceSize "
- "MirrorSize IncrementSize\n"):
- sys.stderr.write("Format of %s may be unfamiliar\n"
- % (file_stats_rp.path))
- return fileobj
+
+class StatisticsRPaths:
+ """Hold file_statistics and session_statistics rpaths"""
+ def __init__(self, rbdir):
+ """Initializer - read increment files from rbdir"""
+ self.rbdir = rbdir
+ self.session_rps = self.get_sorted_inc_rps('session_statistics')
+ self.filestat_rps = self.get_sorted_inc_rps('file_statistics')
+ self.combined_pairs = self.get_combined_pairs()
+
+ def get_sorted_inc_rps(self, prefix):
+ """Return list of sorted rps with given prefix"""
+ incs = restore.get_inclist(self.rbdir.append(prefix))
+ if begin_time:
+ incs = filter(lambda i: i.getinctime() >= begin_time, incs)
+ if end_time:
+ incs = filter(lambda i: i.getinctime() <= end_time, incs)
+ incs.sort(key = lambda i: i.getinctime())
+ return incs
+
+ def get_combined_pairs(self):
+ """Return list of matched (session_rp, file_rp) pairs"""
+ session_dict = {}
+ for inc in self.session_rps: session_dict[inc.getinctime()] = inc
+ filestat_dict = {}
+ for inc in self.filestat_rps: filestat_dict[inc.getinctime()] = inc
+
+ result = []
+ keylist = session_dict.keys()
+ keylist.sort()
+ for time in keylist:
+ if filestat_dict.has_key(time):
+ result.append((session_dict[time], filestat_dict[time]))
+ else: sys.stderr.write("No file_statistics to match %s\n" %
+ (session_dict[time].path,))
+ return result
+
+def print_session_statistics(stat_rpaths):
+ print "Session statistics:"
+ system("rdiff-backup --calculate-average " +
+ " ".join([inc.path for inc in stat_rpaths.session_rps]))
+
+
+class FileStatisticsTree:
+ """Holds a tree of important files/directories, along with cutoffs"""
+ def __init__(self, cutoff_fs, fs_root):
+ """Initialize with FileStat cutoff object, and root of tree"""
+ self.cutoff_fs = cutoff_fs
+ self.fs_root = fs_root
+
+ def __iadd__(self, other):
+ """Add cutoffs, and merge the other's fs_root"""
+ self.cutoff_fs += other.cutoff_fs
+ self.merge_tree(self.fs_root, other.fs_root)
+ return self
+
+ def merge_tree(self, myfs, otherfs):
+ """Add other_fs's tree to one of my fs trees"""
+ assert myfs.nametuple == otherfs.nametuple
+ total_children = {}
+ mine = dict([(child.nametuple, child) for child in myfs.children])
+ others = dict([(child.nametuple, child) for child in otherfs.children])
+ for name in mine.keys() + others.keys(): # Remove duplicates
+ if not total_children.has_key(name):
+ total_children[name] = (mine.get(name), others.get(name))
+
+ # Subtract subdirectories so we can rebuild
+ for child in myfs.children: myfs -= child
+ for child in otherfs.children: otherfs -= child
+ myfs.children = []
+
+ for (name, (mychild, otherchild)) in total_children.items():
+ if mychild:
+ if otherchild: self.merge_tree(mychild, otherchild)
+ myfs += mychild
+ myfs.children.append(mychild)
+ elif otherchild:
+ myfs += otherchild
+ myfs.children.append(otherchild)
+ else: assert 0
+ myfs += otherfs
+
+ def get_top_fs(self, fs_func):
+ """Process the FileStat tree and find everything above the cutoff
+
+ fs_func will be used to evaluate cutoff_fs and those in the
+ tree. Of course the root will be above the cutoff, but we try
+ to find the most specific directories still above the cutoff.
+ The value of any directories that make the cutoff will be
+ excluded from the value of parent directories.
+
+ """
+ abs_cutoff = fs_func(self.cutoff_fs)
+ def helper(subtree):
+ """Returns ([list of (top fs, value)], total excluded amount)"""
+ subtree_val = fs_func(subtree)
+ if subtree_val <= abs_cutoff: return ([], 0)
+
+ top_children, total_excluded = [], 0
+ for child in subtree.children:
+ top_sublist, excluded = helper(child)
+ top_children.extend(top_sublist)
+ total_excluded += excluded
+
+ current_value = subtree_val - total_excluded
+ if current_value >= abs_cutoff:
+ return ([(subtree, current_value)] + top_children, subtree_val)
+ else: return (top_children, total_excluded)
+ return helper(self.fs_root)[0]
+
+ def print_top_dirs(self, label, fs_func):
+ """Print the top directories in sorted order"""
+ def print_line(fs, val):
+ percentage = float(val)/fs_func(self.fs_root) * 100
+ path = fs.nametuple and '/'.join(fs.nametuple) or '.'
+ print '%s (%02.1f%%)' % (path, percentage)
+
+ s = "Top directories by %s (percent of total)" % (label,)
+ print "\n%s\n%s" % (s, ('-'*len(s)))
+ top_fs_pair_list = self.get_top_fs(fs_func)
+ top_fs_pair_list.sort(key = lambda pair: pair[1], reverse = 1)
+ for fs, val in top_fs_pair_list: print_line(fs, val)
+
+def make_fst(session_rp, filestat_rp):
+ """Construct FileStatisticsTree given session and file stat rps
+
+ We would like a full tree, but this in general will take too much
+ memory. Instead we will build a tree that has only the
+ files/directories with some stat exceeding the min ratio.
+
+ """
+ def get_ss_dict():
+ """Parse session statistics file and return dictionary with ss data"""
+ fileobj = session_rp.open('r', session_rp.isinccompressed())
+ return_val = {}
+ for line in fileobj:
+ if line.startswith('#'): continue
+ comps = line.split()
+ if len(comps) < 2:
+ sys.stderr.write("Unable to parse session statistics line: "
+ +line)
+ continue
+ return_val[comps[0]] = float(comps[1])
+ return return_val
+
+ def get_cutoff_fs(session_dict):
+ """Return FileStat object set with absolute cutoffs
+
+ Each cutoff is min_ratio times a session total; the changed-file
+ total counts new, changed and deleted files. Any FileStat >= the
+ result in any aspect will be considered "important".
+ """
+ def get_min(attrib): return min_ratio*session_dict[attrib]
+ min_changed = min_ratio*(session_dict['NewFiles'] +
+ session_dict['ChangedFiles'] + session_dict['DeletedFiles'])
+ return FileStat((), min_changed, get_min('SourceFileSize'),
+ get_min('IncrementFileSize'))
+
+ def yield_fs_objs(filestatsobj):
+ """Iterate FileStats by opening file_statistics fileobj"""
+ r = re.compile("^(.*) ([0-9]+) ([0-9]+|NA) ([0-9]+|NA) "
+ "([0-9]+|NA)\n?$")
+ for line in filestatsobj:
+ if line.startswith('#'): continue
+ match = r.match(line)
+ if not match:
+ sys.stderr.write("Error parsing line: %s\n" % (line,))
+ continue
+
+ filename = match.group(1)
+ if filename == '.': nametuple = ()
+ else: nametuple = tuple(filename.split('/'))
+
+ sourcesize_str = match.group(3)
+ if sourcesize_str == 'NA': sourcesize = 0
+ else: sourcesize = int(sourcesize_str)
+
+ incsize_str = match.group(5)
+ if incsize_str == 'NA': incsize = 0
+ else: incsize = int(incsize_str)
+
+ yield FileStat(nametuple, int(match.group(2)), sourcesize, incsize)
+
+ def accumulate_fs(fs_iter):
+ """Yield the FileStat objects in fs_iter, but with total statistics
+
+ In fs_iter, the statistics of directories FileStats only apply
+ to themselves. This will iterate the same FileStats, but
+ directories will include all the files under them. As a
+ result, the directories will come after the files in them
+ (e.g. '.' will be last.).
+
+ Naturally this would be written recursively, but profiler said
+ it was too slow in python.
+
+ """
+ root = fs_iter.next()
+ assert root.nametuple == (), root
+ stack = [root]
+ try: fs = fs_iter.next()
+ except StopIteration:
+ yield root
+ return
+
+ while 1:
+ if fs and fs.is_child(stack[-1]):
+ stack.append(fs)
+ try: fs = fs_iter.next()
+ except StopIteration: fs = None
+ else:
+ expired = stack.pop()
+ yield expired
+ if not stack: return
+ else: stack[-1].add_child(expired)
+
+
+ def make_tree_one_level(fs_iter, first_fs):
+ """Populate a tree of FileStat objects from fs_iter
+
+ This function wants the fs_iter in the reverse direction as
+ usual, with the parent coming directly after all the children.
+ It will return the parent of first_fs.
+
+ """
+ children = [first_fs]
+ fs = fs_iter.next()
+ while 1:
+ if first_fs.is_child(fs):
+ fs.children = children
+ return fs
+ elif first_fs.is_brother(fs):
+ children.append(fs)
+ fs = fs_iter.next()
+ else: fs = make_tree_one_level(fs_iter, fs)
+
+ def make_root_tree(fs_iter):
+ """Like make_tree, but assume fs_iter starts at the root"""
+ try: fs = fs_iter.next()
+ except StopIteration: sys.exit("No files in iterator")
+
+ while fs.nametuple != (): fs = make_tree_one_level(fs_iter, fs)
+ return fs
+
+ cutoff_fs = get_cutoff_fs(get_ss_dict())
+ filestat_fileobj = ReadlineBuffer(filestat_rp)
+ accumulated_iter = accumulate_fs(yield_fs_objs(filestat_fileobj))
+ important_iter = lazy.Iter.filter(lambda fs: fs >= cutoff_fs,
+ accumulated_iter)
+ trimmed_tree = make_root_tree(important_iter)
+ return FileStatisticsTree(cutoff_fs, trimmed_tree)
+
class FileStat:
"""Hold the information in one line of file_statistics
@@ -93,116 +320,111 @@ class FileStat:
self.sourcesize, self.incsize = sourcesize, incsize
self.children = []
- def add_child(self, child):
- self.children.append(child)
- self.changed += child.changed
- self.sourcesize += child.sourcesize
- self.incsize += child.incsize
+ def add_child(self, child): self += child
+
+ def is_subdir(self, parent):
+ """Return True if self is an eventual subdir of parent"""
+ return self.nametuple[:len(parent.nametuple)] == parent.nametuple
+
+ def is_child(self, parent):
+ """Return True if self is an immediate child of parent"""
+ return self.nametuple and self.nametuple[:-1] == parent.nametuple
+
+ def is_brother(self, brother):
+ """Return True if self is in same directory as brother"""
+ if not self.nametuple or not brother.nametuple: return 0
+ return self.nametuple[:-1] == brother.nametuple[:-1]
def __str__(self):
return "%s %s %s %s" % (self.nametuple, self.changed,
self.sourcesize, self.incsize)
-def yield_fs_objs(filestatsobj):
- """Iterate FileStats from open file_statistics fileobj"""
- r = re.compile("^(.*) ([0-9]+) ([0-9]+|NA) ([0-9]+|NA) ([0-9]+|NA)\n?$")
- while 1:
- line = filestatsobj.readline()
- if not line: break
- if line.startswith('#'): continue
-
- match = r.match(line)
- if not match:
- print "Error parsing line: ", line
- continue
-
- filename = match.group(1)
- if filename == '.': nametuple = ()
- else: nametuple = tuple(filename.split('/'))
- if match.group(3) == 'NA': sourcesize = 0
- else: sourcesize = int(match.group(3))
- if match.group(5) == 'NA': incsize = 0
- else: incsize = int(match.group(5))
-
- yield FileStat(nametuple, int(match.group(2)), sourcesize, incsize)
-
-def make_tree(fs_iter, root_fs):
- """Populate a tree of FileStat objects from fs_iter
-
- We require that the nametuple of every FileStat put into the tree
- starts with the same nametuple as root_fs. Return value will be a
- tuple (root fs object, overflow), where overflow is the next
- FileStat object in the iterator, or None.
+ def __eq__(self, other):
+ return (self.changed == other.changed and
+ self.sourcesize == other.sourcesize and
+ self.incsize == other.incsize)
+
+ def __ge__(self, other):
+ """Note the 'or' -- this relation is not a well ordering"""
+ return (self.changed >= other.changed or
+ self.sourcesize >= other.sourcesize or
+ self.incsize >= other.incsize)
+
+ def __iadd__(self, other):
+ """Add values of other to self"""
+ self.changed += other.changed
+ self.sourcesize += other.sourcesize
+ self.incsize += other.incsize
+ return self
+
+ def __isub__(self, other):
+ """Subtract values of other from self"""
+ self.changed -= other.changed
+ self.sourcesize -= other.sourcesize
+ self.incsize -= other.incsize
+ return self
- """
- try: fs = fs_iter.next()
- except StopIteration: return (root_fs, None)
-
- while 1:
- if fs.nametuple[:len(root_fs.nametuple)] != root_fs.nametuple:
- return (root_fs, fs)
- subtree, fs = make_tree(fs_iter, fs)
- root_fs.add_child(subtree)
- if not fs: return (root_fs, None)
-
-def make_root_tree(fs_iter):
- """Like make_tree, but assume fs_iter starts at the root"""
- try: root_fs = fs_iter.next()
- except StopIteration: sys.exit("No files in iterator")
- assert root_fs.nametuple == (), root_fs
- tree, overflow = make_tree(fs_iter, root_fs)
- assert overflow is None, overflow
- return tree
-
-def get_top_fs(fs_tree, cutoff = .05, fs_func = lambda fs: fs.incsize):
- """Process the FileStat tree and find everything above the cutoff
-
- cutoff is a fraction of the root. Of course the root will be
- above the cutoff, but we try to find the most specific directories
- still above the cutoff. The value of any directories that make
- the cutoff will be excluded from the value of parent directories.
+
+class ReadlineBuffer:
+ """Iterate lines like a normal filelike obj
+
+ Use this because gzip doesn't provide any buffering, so readline()
+ is very slow.
"""
- abs_cutoff = cutoff*fs_func(fs_tree)
- def helper(subtree):
- """Returns ([list of (top fs, value)], total excluded amount)"""
- subtree_val = fs_func(subtree)
- if subtree_val <= abs_cutoff: return ([], 0)
-
- top_children, total_excluded = [], 0
- for child in subtree.children:
- top_sublist, excluded = helper(child)
- top_children.extend(top_sublist)
- total_excluded += excluded
-
- current_value = subtree_val - total_excluded
- if current_value >= abs_cutoff:
- return ([(subtree, current_value)] + top_children, subtree_val)
- else: return (top_children, total_excluded)
- return helper(fs_tree)[0]
-
-def print_top_dirs(fs_tree, label, fs_func):
- """Print the top directories in sorted order"""
- def print_line(fs, val):
- percentage = float(val)/fs_func(fs_tree) * 100
- path = fs.nametuple and '/'.join(fs.nametuple) or '.'
- print '%s (%02.1f%%)' % (path, percentage)
-
- s = "Top directories by %s (percent of total)" % (label,)
- print s + '\n' + ('-'*len(s))
- top_fs_pair_list = get_top_fs(fs_tree, .05, fs_func)
- top_fs_pair_list.sort(key = lambda pair: pair[1], reverse = 1)
- for fs, val in top_fs_pair_list: print_line(fs, val)
+ blocksize = 65536
+ separator = '\n'
+
+ def __init__(self, rp):
+ """Initialize with rpath"""
+ self.buffer = ['']
+ self.at_end = 0
+
+ if rp.isincfile():
+ self.fileobj = rp.open('r', rp.isinccompressed())
+ else: self.fileobj = rp.open('r')
+
+ def __iter__(self):
+ """Yield the lines in self.fileobj"""
+ while self.buffer or not self.at_end:
+ if len(self.buffer) > 1: yield self.buffer.pop(0)
+ elif not self.at_end: self.addtobuffer()
+ else:
+ last = self.buffer.pop()
+ if last: yield last
+
+ def addtobuffer(self):
+ """Read next block from fileobj, split and add to bufferlist"""
+ block = self.fileobj.read(self.blocksize)
+ if block:
+ split = block.split(self.separator)
+ self.buffer[0] += split[0]
+ self.buffer.extend(split[1:])
+ else: self.at_end = 1
+
+def sum_fst(rp_pairs):
+ """Add the file statistics given as list of (session_rp, file_rp) pairs"""
+ n = len(rp_pairs)
+ print "Processing statistics from session 1 of %d" % (n,)
+ total_fst = make_fst(*rp_pairs[0])
+ for i in range(1, n):
+ print "Processing statistics from session %d of %d" % (i+1, n)
+ session_rp, filestat_rp = rp_pairs[i]
+ fst = make_fst(session_rp, filestat_rp)
+ total_fst += fst
+ return total_fst
def Main():
- check_args()
- print_statistics()
- fs_tree = make_root_tree(yield_fs_objs(get_open_filestats()))
- print_top_dirs(fs_tree, "source size", lambda fs: fs.sourcesize)
- print
- print_top_dirs(fs_tree, "increment size", lambda fs: fs.incsize)
- print
- print_top_dirs(fs_tree, "number of files changed",
- lambda fs: fs.changed)
+ parse_args()
+ srp = StatisticsRPaths(Globals.rbdir)
+ if not srp.combined_pairs: sys.exit("No matching sessions found")
+ if len(srp.combined_pairs) == 1: fst = make_fst(*srp.combined_pairs[0])
+ else: fst = sum_fst(srp.combined_pairs)
+
+ print_session_statistics(srp)
+ fst.print_top_dirs("source size", lambda fs: fs.sourcesize)
+ fst.print_top_dirs("increment size", lambda fs: fs.incsize)
+ fst.print_top_dirs("number of files changed", lambda fs: fs.changed)
if __name__ == '__main__': Main()
+