diff options
-rwxr-xr-x | rdiff-backup/rdiff-backup-statistics | 528 |
1 files changed, 375 insertions, 153 deletions
diff --git a/rdiff-backup/rdiff-backup-statistics b/rdiff-backup/rdiff-backup-statistics index 761b0c3..b289fe7 100755 --- a/rdiff-backup/rdiff-backup-statistics +++ b/rdiff-backup/rdiff-backup-statistics @@ -20,65 +20,292 @@ # USA -import os, sys, re -import rdiff_backup.connection, rdiff_backup.regress -import rdiff_backup.rpath as rpath -import rdiff_backup.Globals as Globals -import rdiff_backup.restore as restore - - -tag = None # Set to an rdiff-backup session time - -def check_args(): - global tag - def error(msg): - sys.stderr.write("Command line error: %s\n" % (msg,)) - sys.exit(2) - if not (2 <= len(sys.argv) <= 3): - error("usage: %s mirror-directory [date_tag]" % (sys.argv[0],)) +import profile, pstats +import os, sys, re, getopt +from rdiff_backup import connection, regress, rpath, Globals, restore, Time, lazy + + +begin_time = None # Parse statistics at or after this time... +end_time = None # ... and at or before this time (epoch seconds) +min_ratio = .05 # report only files/directories over this number + +def parse_args(): + global begin_time, end_time, min_ratio + try: optlist, args = getopt.getopt(sys.argv[1:], "", + ["begin-time=", "end-time=", "minimum-ratio="]) + except getopt.error, e: + sys.exit("Bad commandline options: " + str(e)) + + for opt, arg in optlist: + if opt == "--begin-time": begin_time = Time.genstrtotime(arg) + elif opt == "--end-time": end_time = Time.genstrtotime(arg) + elif opt == "--minimum-ratio": min_ratio = float(arg) + else: assert 0 + + if len(args) != 1: + sys.exit("Usage: %s --begin-time <time> --end-time <time> <backup-dir>" + % (sys.argv[0],)) + Globals.rbdir = rpath.RPath(Globals.local_connection, - os.path.join(sys.argv[1], 'rdiff-backup-data')) + os.path.join(args[0], 'rdiff-backup-data')) if not Globals.rbdir.isdir(): - error("Directory %s not found" % (Globals.rbdir.path,)) + sys.exit("Directory %s not found" % (Globals.rbdir.path,)) if len(sys.argv) == 3: tag = sys.argv[2] def system(cmd): if os.system(cmd): sys.exit("Error running command '%s'\n" % (cmd,)) -def get_rbdir_inc_rpath(prefix): - """Get rp in rdiff-backup-data given prefix (either newest or with tag)""" - if tag: - rp1 = Globals.rbdir.append('%s.%s.data' % (prefix, tag)) - if rp1.isreg(): return rp1 - rp2 = Globals.rbdir.append('%s.%s.data.gz' % (prefix, tag)) - if rp2.isreg(): return rp2 - sys.exit(rp.path + " is not a regular file. Bad tag specified?") - else: - rp_base = Globals.rbdir.append(prefix) - inclist = restore.get_inclist(rp_base) - if not inclist: sys.exit("No data files in rdiff-backup-data dir " - "starting with %s were found!" % (prefix,)) - inclist.sort(key = lambda i: i.getinctime()) - return inclist[-1] - -def print_statistics(): - print "\nSession statistics:" - print get_rbdir_inc_rpath('session_statistics').get_data() - print "\nAverage statistics:" - system("rdiff-backup --calculate-average %s/session_statistics.*" % - (Globals.rbdir.path,)) - -def get_open_filestats(): - """Return open file object based on file_statistics""" - file_stats_rp = get_rbdir_inc_rpath('file_statistics') - assert file_stats_rp.isincfile() - fileobj = file_stats_rp.open('r', file_stats_rp.isinccompressed()) - fileobj.readline() - if fileobj.readline() != ("# Filename Changed SourceSize " - "MirrorSize IncrementSize\n"): - sys.stderr.write("Format of %s may be unfamiliar\n" - % (file_stats_rp.path)) - return fileobj + +class StatisticsRPaths: + """Hold file_statistics and session_statistics rpaths""" + def __init__(self, rbdir): + """Initializer - read increment files from rbdir""" + self.rbdir = rbdir + self.session_rps = self.get_sorted_inc_rps('session_statistics') + self.filestat_rps = self.get_sorted_inc_rps('file_statistics') + self.combined_pairs = self.get_combined_pairs() + + def get_sorted_inc_rps(self, prefix): + """Return list of sorted rps with given prefix""" + incs = restore.get_inclist(self.rbdir.append(prefix)) + if begin_time: + incs = filter(lambda i: i.getinctime() >= begin_time, incs) + if end_time: + incs = filter(lambda i: i.getinctime() <= end_time, incs) + incs.sort(key = lambda i: i.getinctime()) + return incs + + def get_combined_pairs(self): + """Return list of matched (session_rp, file_rp) pairs""" + session_dict = {} + for inc in self.session_rps: session_dict[inc.getinctime()] = inc + filestat_dict = {} + for inc in self.filestat_rps: filestat_dict[inc.getinctime()] = inc + + result = [] + keylist = session_dict.keys() + keylist.sort() + for time in keylist: + if filestat_dict.has_key(time): + result.append((session_dict[time], filestat_dict[time])) + else: sys.stderr.write("No file_statistics to match %s\n" % + (session_dict[time].path,)) + return result + +def print_session_statistics(stat_rpaths): + print "Session statistics:" + system("rdiff-backup --calculate-average " + + " ".join([inc.path for inc in stat_rpaths.session_rps])) + + +class FileStatisticsTree: + """Holds a tree of important files/directories, along with cutoffs""" + def __init__(self, cutoff_fs, fs_root): + """Initialize with FileStat cutoff object, and root of tree""" + self.cutoff_fs = cutoff_fs + self.fs_root = fs_root + + def __iadd__(self, other): + """Add cutoffs, and merge the other's fs_root""" + self.cutoff_fs += other.cutoff_fs + self.merge_tree(self.fs_root, other.fs_root) + return self + + def merge_tree(self, myfs, otherfs): + """Add other_fs's tree to one of my fs trees""" + assert myfs.nametuple == otherfs.nametuple + total_children = {} + mine = dict([(child.nametuple, child) for child in myfs.children]) + others = dict([(child.nametuple, child) for child in otherfs.children]) + for name in mine.keys() + others.keys(): # Remove duplicates + if not total_children.has_key(name): + total_children[name] = (mine.get(name), others.get(name)) + + # Subtract subdirectories so we can rebuild + for child in myfs.children: myfs -= child + for child in otherfs.children: otherfs -= child + myfs.children = [] + + for (name, (mychild, otherchild)) in total_children.items(): + if mychild: + if otherchild: self.merge_tree(mychild, otherchild) + myfs += mychild + myfs.children.append(mychild) + elif otherchild: + myfs += otherchild + myfs.children.append(otherchild) + else: assert 0 + myfs += otherfs + + def get_top_fs(self, fs_func): + """Process the FileStat tree and find everything above the cutoff + + fs_func will be used to evaluate cutoff_fs and those in the + tree. Of course the root will be above the cutoff, but we try + to find the most specific directories still above the cutoff. + The value of any directories that make the cutoff will be + excluded from the value of parent directories. + + """ + abs_cutoff = fs_func(self.cutoff_fs) + def helper(subtree): + """Returns ([list of (top fs, value)], total excluded amount)""" + subtree_val = fs_func(subtree) + if subtree_val <= abs_cutoff: return ([], 0) + + top_children, total_excluded = [], 0 + for child in subtree.children: + top_sublist, excluded = helper(child) + top_children.extend(top_sublist) + total_excluded += excluded + + current_value = subtree_val - total_excluded + if current_value >= abs_cutoff: + return ([(subtree, current_value)] + top_children, subtree_val) + else: return (top_children, total_excluded) + return helper(self.fs_root)[0] + + def print_top_dirs(self, label, fs_func): + """Print the top directories in sorted order""" + def print_line(fs, val): + percentage = float(val)/fs_func(self.fs_root) * 100 + path = fs.nametuple and '/'.join(fs.nametuple) or '.' + print '%s (%02.1f%%)' % (path, percentage) + + s = "Top directories by %s (percent of total)" % (label,) + print "\n%s\n%s" % (s, ('-'*len(s))) + top_fs_pair_list = self.get_top_fs(fs_func) + top_fs_pair_list.sort(key = lambda pair: pair[1], reverse = 1) + for fs, val in top_fs_pair_list: print_line(fs, val) + +def make_fst(session_rp, filestat_rp): + """Construct FileStatisticsTree given session and file stat rps + + We would like a full tree, but this in general will take too much + memory. Instead we will build a tree that has only the + files/directories with some stat exceeding the min ratio. + + """ + def get_ss_dict(): + """Parse session statistics file and return dictionary with ss data""" + fileobj = session_rp.open('r', session_rp.isinccompressed()) + return_val = {} + for line in fileobj: + if line.startswith('#'): continue + comps = line.split() + if len(comps) < 2: + sys.stderr.write("Unable to parse session statistics line: " + +line) + continue + return_val[comps[0]] = float(comps[1]) + return return_val + + def get_cutoff_fs(session_dict): + """Return FileStat object set with absolute cutoffs + + Any FileStat object that is bigger than the result in any + aspect will be considered "important". + + """ + def get_min(attrib): return min_ratio*session_dict[attrib] + min_changed = min_ratio*(session_dict['NewFiles'] + + session_dict['ChangedFiles'] + session_dict['NewFiles']) + return FileStat((), min_changed, get_min('SourceFileSize'), + get_min('IncrementFileSize')) + + def yield_fs_objs(filestatsobj): + """Iterate FileStats by opening file_statistics fileobj""" + r = re.compile("^(.*) ([0-9]+) ([0-9]+|NA) ([0-9]+|NA) " + "([0-9]+|NA)\n?$") + for line in filestatsobj: + if line.startswith('#'): continue + match = r.match(line) + if not match: + sys.stderr.write("Error parsing line: %s\n" % (line,)) + continue + + filename = match.group(1) + if filename == '.': nametuple = () + else: nametuple = tuple(filename.split('/')) + + sourcesize_str = match.group(3) + if sourcesize_str == 'NA': sourcesize = 0 + else: sourcesize = int(sourcesize_str) + + incsize_str = match.group(5) + if incsize_str == 'NA': incsize = 0 + else: incsize = int(incsize_str) + + yield FileStat(nametuple, int(match.group(2)), sourcesize, incsize) + + def accumulate_fs(fs_iter): + """Yield the FileStat objects in fs_iter, but with total statistics + + In fs_iter, the statistics of directories FileStats only apply + to themselves. This will iterate the same FileStats, but + directories will include all the files under them. As a + result, the directories will come after the files in them + (e.g. '.' will be last.). + + Naturally this would be written recursively, but profiler said + it was too slow in python. + + """ + root = fs_iter.next() + assert root.nametuple == (), root + stack = [root] + try: fs = fs_iter.next() + except StopIteration: + yield root + return + + while 1: + if fs and fs.is_child(stack[-1]): + stack.append(fs) + try: fs = fs_iter.next() + except StopIteration: fs = None + else: + expired = stack.pop() + yield expired + if not stack: return + else: stack[-1].add_child(expired) + + + def make_tree_one_level(fs_iter, first_fs): + """Populate a tree of FileStat objects from fs_iter + + This function wants the fs_iter in the reverse direction as + usual, with the parent coming directly after all the children. + It will return the parent of first_fs. + + """ + children = [first_fs] + fs = fs_iter.next() + while 1: + if first_fs.is_child(fs): + fs.children = children + return fs + elif first_fs.is_brother(fs): + children.append(fs) + fs = fs_iter.next() + else: fs = make_tree_one_level(fs_iter, fs) + + def make_root_tree(fs_iter): + """Like make_tree, but assume fs_iter starts at the root""" + try: fs = fs_iter.next() + except StopIteration: sys.exit("No files in iterator") + + while fs.nametuple != (): fs = make_tree_one_level(fs_iter, fs) + return fs + + cutoff_fs = get_cutoff_fs(get_ss_dict()) + filestat_fileobj = ReadlineBuffer(filestat_rp) + accumulated_iter = accumulate_fs(yield_fs_objs(filestat_fileobj)) + important_iter = lazy.Iter.filter(lambda fs: fs >= cutoff_fs, + accumulated_iter) + trimmed_tree = make_root_tree(important_iter) + return FileStatisticsTree(cutoff_fs, trimmed_tree) + class FileStat: """Hold the information in one line of file_statistics @@ -93,116 +320,111 @@ class FileStat: self.sourcesize, self.incsize = sourcesize, incsize self.children = [] - def add_child(self, child): - self.children.append(child) - self.changed += child.changed - self.sourcesize += child.sourcesize - self.incsize += child.incsize + def add_child(self, child): self += child + + def is_subdir(self, parent): + """Return True if self is an eventual subdir of parent""" + return self.nametuple[:len(parent.nametuple)] == parent.nametuple + + def is_child(self, parent): + """Return True if self is an immediate child of parent""" + return self.nametuple and self.nametuple[:-1] == parent.nametuple + + def is_brother(self, brother): + """Return True if self is in same directory as brother""" + if not self.nametuple or not brother.nametuple: return 0 + return self.nametuple[:-1] == brother.nametuple[:-1] def __str__(self): return "%s %s %s %s" % (self.nametuple, self.changed, self.sourcesize, self.incsize) -def yield_fs_objs(filestatsobj): - """Iterate FileStats from open file_statistics fileobj""" - r = re.compile("^(.*) ([0-9]+) ([0-9]+|NA) ([0-9]+|NA) ([0-9]+|NA)\n?$") - while 1: - line = filestatsobj.readline() - if not line: break - if line.startswith('#'): continue - - match = r.match(line) - if not match: - print "Error parsing line: ", line - continue - - filename = match.group(1) - if filename == '.': nametuple = () - else: nametuple = tuple(filename.split('/')) - if match.group(3) == 'NA': sourcesize = 0 - else: sourcesize = int(match.group(3)) - if match.group(5) == 'NA': incsize = 0 - else: incsize = int(match.group(5)) - - yield FileStat(nametuple, int(match.group(2)), sourcesize, incsize) - -def make_tree(fs_iter, root_fs): - """Populate a tree of FileStat objects from fs_iter - - We require that the nametuple of every FileStat put into the tree - starts with the same nametuple as root_fs. Return value will be a - tuple (root fs object, overflow), where overflow is the next - FileStat object in the iterator, or None. + def __eq__(self, other): + return (self.changed == other.changed and + self.sourcesize == other.sourcesize and + self.incsize == other.incsize) + + def __ge__(self, other): + """Note the 'or' -- this relation is not a well ordering""" + return (self.changed >= other.changed or + self.sourcesize >= other.sourcesize or + self.incsize >= other.incsize) + + def __iadd__(self, other): + """Add values of other to self""" + self.changed += other.changed + self.sourcesize += other.sourcesize + self.incsize += other.incsize + return self + + def __isub__(self, other): + """Subtract values of other from self""" + self.changed -= other.changed + self.sourcesize -= other.sourcesize + self.incsize -= other.incsize + return self - """ - try: fs = fs_iter.next() - except StopIteration: return (root_fs, None) - - while 1: - if fs.nametuple[:len(root_fs.nametuple)] != root_fs.nametuple: - return (root_fs, fs) - subtree, fs = make_tree(fs_iter, fs) - root_fs.add_child(subtree) - if not fs: return (root_fs, None) - -def make_root_tree(fs_iter): - """Like make_tree, but assume fs_iter starts at the root""" - try: root_fs = fs_iter.next() - except StopIteration: sys.exit("No files in iterator") - assert root_fs.nametuple == (), root_fs - tree, overflow = make_tree(fs_iter, root_fs) - assert overflow is None, overflow - return tree - -def get_top_fs(fs_tree, cutoff = .05, fs_func = lambda fs: fs.incsize): - """Process the FileStat tree and find everything above the cutoff - - cutoff is a fraction of the root. Of course the root will be - above the cutoff, but we try to find the most specific directories - still above the cutoff. The value of any directories that make - the cutoff will be excluded from the value of parent directories. + +class ReadlineBuffer: + """Iterate lines like a normal filelike obj + + Use this because gzip doesn't provide any buffering, so readline() + is very slow. """ - abs_cutoff = cutoff*fs_func(fs_tree) - def helper(subtree): - """Returns ([list of (top fs, value)], total excluded amount)""" - subtree_val = fs_func(subtree) - if subtree_val <= abs_cutoff: return ([], 0) - - top_children, total_excluded = [], 0 - for child in subtree.children: - top_sublist, excluded = helper(child) - top_children.extend(top_sublist) - total_excluded += excluded - - current_value = subtree_val - total_excluded - if current_value >= abs_cutoff: - return ([(subtree, current_value)] + top_children, subtree_val) - else: return (top_children, total_excluded) - return helper(fs_tree)[0] - -def print_top_dirs(fs_tree, label, fs_func): - """Print the top directories in sorted order""" - def print_line(fs, val): - percentage = float(val)/fs_func(fs_tree) * 100 - path = fs.nametuple and '/'.join(fs.nametuple) or '.' - print '%s (%02.1f%%)' % (path, percentage) - - s = "Top directories by %s (percent of total)" % (label,) - print s + '\n' + ('-'*len(s)) - top_fs_pair_list = get_top_fs(fs_tree, .05, fs_func) - top_fs_pair_list.sort(key = lambda pair: pair[1], reverse = 1) - for fs, val in top_fs_pair_list: print_line(fs, val) + blocksize = 65536 + separator = '\n' + + def __init__(self, rp): + """Initialize with rpath""" + self.buffer = [''] + self.at_end = 0 + + if rp.isincfile(): + self.fileobj = rp.open('r', rp.isinccompressed()) + else: self.fileobj = rp.open('r') + + def __iter__(self): + """Yield the lines in self.fileobj""" + while self.buffer or not self.at_end: + if len(self.buffer) > 1: yield self.buffer.pop(0) + elif not self.at_end: self.addtobuffer() + else: + last = self.buffer.pop() + if last: yield last + + def addtobuffer(self): + """Read next block from fileobj, split and add to bufferlist""" + block = self.fileobj.read(self.blocksize) + if block: + split = block.split(self.separator) + self.buffer[0] += split[0] + self.buffer.extend(split[1:]) + else: self.at_end = 1 + +def sum_fst(rp_pairs): + """Add the file statistics given as list of (session_rp, file_rp) pairs""" + n = len(rp_pairs) + print "Processing statistics from session 1 of %d" % (n,) + total_fst = make_fst(*rp_pairs[0]) + for i in range(1, n): + print "Processing statistics from session %d of %d" % (i+1, n) + session_rp, filestat_rp = rp_pairs[i] + fst = make_fst(session_rp, filestat_rp) + total_fst += fst + return total_fst def Main(): - check_args() - print_statistics() - fs_tree = make_root_tree(yield_fs_objs(get_open_filestats())) - print_top_dirs(fs_tree, "source size", lambda fs: fs.sourcesize) - print - print_top_dirs(fs_tree, "increment size", lambda fs: fs.incsize) - print - print_top_dirs(fs_tree, "number of files changed", - lambda fs: fs.changed) + parse_args() + srp = StatisticsRPaths(Globals.rbdir) + if not srp.combined_pairs: sys.exit("No matching sessions found") + if len(srp.combined_pairs) == 1: fst = make_fst(*srp.combined_pairs[0]) + else: fst = sum_fst(srp.combined_pairs) + + print_session_statistics(srp) + fst.print_top_dirs("source size", lambda fs: fs.sourcesize) + fst.print_top_dirs("increment size", lambda fs: fs.incsize) + fst.print_top_dirs("number of files changed", lambda fs: fs.changed) if __name__ == '__main__': Main() + |