diff options
Diffstat (limited to 'bin/swift-recon')
-rwxr-xr-x | bin/swift-recon | 436 |
1 files changed, 350 insertions, 86 deletions
diff --git a/bin/swift-recon b/bin/swift-recon index bbfbc1ad0..48b00f02a 100755 --- a/bin/swift-recon +++ b/bin/swift-recon @@ -12,9 +12,9 @@ try: except ImportError: import json from hashlib import md5 -import datetime import eventlet import optparse +import time import sys import os @@ -26,12 +26,7 @@ class Scout(object): def __init__(self, recon_type, verbose=False, suppress_errors=False, timeout=5): - recon_uri = ["ringmd5", "async", "replication", "load", "diskusage", - "unmounted", "quarantined", "sockstat"] - if recon_type not in recon_uri: - raise Exception("Invalid scout type requested") - else: - self.recon_type = recon_type + self.recon_type = recon_type self.verbose = verbose self.suppress_errors = suppress_errors self.timeout = timeout @@ -87,6 +82,44 @@ class SwiftRecon(object): self.timeout = 5 self.pool_size = 30 self.pool = eventlet.GreenPool(self.pool_size) + self.check_types = ['account', 'container', 'object'] + self.server_type = 'object' + + def _gen_stats(self, stats, name=None): + """ compute various stats from a list of values """ + cstats = [x for x in stats if x is not None] + if len(cstats) > 0: + ret_dict = {'low': min(cstats), 'high': max(cstats), + 'total': sum(cstats), 'reported': len(cstats), + 'number_none': len(stats) - len(cstats), 'name': name} + ret_dict['average'] = \ + ret_dict['total'] / float(len(cstats)) + ret_dict['perc_none'] = \ + ret_dict['number_none'] * 100.0 / len(stats) + else: + ret_dict = {'reported': 0} + return ret_dict + + def _print_stats(self, stats): + """ + print out formatted stats to console + + :param stats: dict of stats generated by _gen_stats + """ + print '[%(name)s] low: %(low)d, high: %(high)d, avg: ' \ + '%(average).1f, total: %(total)d, ' \ + 'Failed: %(perc_none).1f%%, no_result: %(number_none)d, ' \ + 'reported: %(reported)d' % stats + + def _ptime(self, timev=None): + """ + :param timev: a unix timestamp or None + :returns: a pretty string of the current time or provided time + """ + if timev: + return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timev)) + else: + return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) def get_devices(self, zone_filter, swift_dir, ring_name): """ @@ -125,10 +158,9 @@ class SwiftRecon(object): ring_sum = md5sum.hexdigest() recon = Scout("ringmd5", self.verbose, self.suppress_errors, self.timeout) - now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - print "[%s] Checking ring md5sum's on %s hosts..." % (now, len(hosts)) + print "[%s] Checking ring md5sums" % self._ptime() if self.verbose: - print "-> On disk md5sum: %s" % ring_sum + print "-> On disk %s md5sum: %s" % (ringfile, ring_sum) for url, response, status in self.pool.imap(recon.scout, hosts): if status == 200: stats[url] = response[ringfile] @@ -152,23 +184,18 @@ class SwiftRecon(object): :param hosts: set of hosts to check. in the format of: set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) """ - stats = {} + scan = {} recon = Scout("async", self.verbose, self.suppress_errors, self.timeout) - now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - print "[%s] Checking async pendings on %s hosts..." % (now, len(hosts)) + print "[%s] Checking async pendings" % self._ptime() for url, response, status in self.pool.imap(recon.scout, hosts): if status == 200: - stats[url] = response['async_pending'] - if len(stats) > 0: - low = min(stats.values()) - high = max(stats.values()) - total = sum(stats.values()) - average = total / len(stats) - print "Async stats: low: %d, high: %d, avg: %d, total: %d" % (low, - high, average, total) + scan[url] = response['async_pending'] + stats = self._gen_stats(scan.values(), 'async_pending') + if stats['reported'] > 0: + self._print_stats(stats) else: - print "Error: No hosts available or returned valid information." + print "[async_pending] - No hosts returned valid data." print "=" * 79 def umount_check(self, hosts): @@ -181,9 +208,8 @@ class SwiftRecon(object): stats = {} recon = Scout("unmounted", self.verbose, self.suppress_errors, self.timeout) - now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") print "[%s] Getting unmounted drives from %s hosts..." % \ - (now, len(hosts)) + (self._ptime(), len(hosts)) for url, response, status in self.pool.imap(recon.scout, hosts): if status == 200: for i in response: @@ -193,6 +219,34 @@ class SwiftRecon(object): print "Not mounted: %s on %s" % (stats[host], node) print "=" * 79 + def expirer_check(self, hosts): + """ + Obtain and print expirer statistics + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) + """ + stats = {'object_expiration_pass': [], 'expired_last_pass': []} + recon = Scout("expirer/%s" % self.server_type, self.verbose, + self.suppress_errors, self.timeout) + print "[%s] Checking on expirers" % self._ptime() + for url, response, status in self.pool.imap(recon.scout, hosts): + if status == 200: + stats['object_expiration_pass'].append( + response.get('object_expiration_pass')) + stats['expired_last_pass'].append( + response.get('expired_last_pass')) + for k in stats: + if stats[k]: + computed = self._gen_stats(stats[k], name=k) + if computed['reported'] > 0: + self._print_stats(computed) + else: + print "[%s] - No hosts returned valid data." % k + else: + print "[%s] - No hosts returned valid data." % k + print "=" * 79 + def replication_check(self, hosts): """ Obtain and print replication statistics @@ -200,24 +254,196 @@ class SwiftRecon(object): :param hosts: set of hosts to check. in the format of: set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) """ + stats = {'replication_time': [], 'failure': [], 'success': [], + 'attempted': []} + recon = Scout("replication/%s" % self.server_type, self.verbose, + self.suppress_errors, self.timeout) + print "[%s] Checking on replication" % self._ptime() + for url, response, status in self.pool.imap(recon.scout, hosts): + if status == 200: + stats['replication_time'].append( + response.get('replication_time')) + repl_stats = response['replication_stats'] + if repl_stats: + for stat_key in ['attempted', 'failure', 'success']: + stats[stat_key].append(repl_stats.get(stat_key)) + for k in stats: + if stats[k]: + if k != 'replication_time': + computed = self._gen_stats(stats[k], + name='replication_%s' % k) + else: + computed = self._gen_stats(stats[k], name=k) + if computed['reported'] > 0: + self._print_stats(computed) + else: + print "[%s] - No hosts returned valid data." % k + else: + print "[%s] - No hosts returned valid data." % k + print "=" * 79 + + def object_replication_check(self, hosts): + """ + Obtain and print replication statistics from object servers + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) + """ stats = {} recon = Scout("replication", self.verbose, self.suppress_errors, self.timeout) - now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - print "[%s] Checking replication times on %s hosts..." % \ - (now, len(hosts)) + print "[%s] Checking on replication" % self._ptime() for url, response, status in self.pool.imap(recon.scout, hosts): if status == 200: stats[url] = response['object_replication_time'] + times = [x for x in stats.values() if x is not None] + if len(stats) > 0 and len(times) > 0: + computed = self._gen_stats(times, 'replication_time') + if computed['reported'] > 0: + self._print_stats(computed) + else: + print "[replication_time] - No hosts returned valid data." + else: + print "[replication_time] - No hosts returned valid data." + print "=" * 79 + + def updater_check(self, hosts): + """ + Obtain and print updater statistics + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) + """ + stats = [] + recon = Scout("updater/%s" % self.server_type, self.verbose, + self.suppress_errors, self.timeout) + print "[%s] Checking updater times" % self._ptime() + for url, response, status in self.pool.imap(recon.scout, hosts): + if status == 200: + if response['%s_updater_sweep' % self.server_type]: + stats.append(response['%s_updater_sweep' % + self.server_type]) if len(stats) > 0: - low = min(stats.values()) - high = max(stats.values()) - total = sum(stats.values()) - average = total / len(stats) - print "[Replication Times] shortest: %s, longest: %s, avg: %s" % \ - (low, high, average) + computed = self._gen_stats(stats, name='updater_last_sweep') + if computed['reported'] > 0: + self._print_stats(computed) + else: + print "[updater_last_sweep] - No hosts returned valid data." else: - print "Error: No hosts available or returned valid information." + print "[updater_last_sweep] - No hosts returned valid data." + print "=" * 79 + + def auditor_check(self, hosts): + """ + Obtain and print obj auditor statistics + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) + """ + scan = {} + adone = '%s_auditor_pass_completed' % self.server_type + afail = '%s_audits_failed' % self.server_type + apass = '%s_audits_passed' % self.server_type + asince = '%s_audits_since' % self.server_type + recon = Scout("auditor/%s" % self.server_type, self.verbose, + self.suppress_errors, self.timeout) + print "[%s] Checking auditor stats" % self._ptime() + for url, response, status in self.pool.imap(recon.scout, hosts): + if status == 200: + scan[url] = response + if len(scan) < 1: + print "Error: No hosts available" + return + stats = {} + stats[adone] = [scan[i][adone] for i in scan + if scan[i][adone] is not None] + stats[afail] = [scan[i][afail] for i in scan + if scan[i][afail] is not None] + stats[apass] = [scan[i][apass] for i in scan + if scan[i][apass] is not None] + stats[asince] = [scan[i][asince] for i in scan + if scan[i][asince] is not None] + for k in stats: + if len(stats[k]) < 1: + print "[%s] - No hosts returned valid data." % k + else: + if k != asince: + computed = self._gen_stats(stats[k], k) + if computed['reported'] > 0: + self._print_stats(computed) + if len(stats[asince]) >= 1: + low = min(stats[asince]) + high = max(stats[asince]) + total = sum(stats[asince]) + average = total / len(stats[asince]) + print '[last_pass] oldest: %s, newest: %s, avg: %s' % \ + (self._ptime(low), self._ptime(high), self._ptime(average)) + print "=" * 79 + + def object_auditor_check(self, hosts): + """ + Obtain and print obj auditor statistics + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) + """ + all_scan = {} + zbf_scan = {} + atime = 'audit_time' + bprocessed = 'bytes_processed' + passes = 'passes' + errors = 'errors' + quarantined = 'quarantined' + recon = Scout("auditor/object", self.verbose, self.suppress_errors, + self.timeout) + print "[%s] Checking auditor stats " % self._ptime() + for url, response, status in self.pool.imap(recon.scout, hosts): + if status == 200: + if response['object_auditor_stats_ALL']: + all_scan[url] = response['object_auditor_stats_ALL'] + if response['object_auditor_stats_ZBF']: + zbf_scan[url] = response['object_auditor_stats_ZBF'] + if len(all_scan) > 0: + stats = {} + stats[atime] = [all_scan[i][atime] for i in all_scan] + stats[bprocessed] = [all_scan[i][bprocessed] for i in all_scan] + stats[passes] = [all_scan[i][passes] for i in all_scan] + stats[errors] = [all_scan[i][errors] for i in all_scan] + stats[quarantined] = [all_scan[i][quarantined] for i in all_scan] + for k in stats: + if None in stats[k]: + stats[k] = [x for x in stats[k] if x is not None] + if len(stats[k]) < 1: + print "[Auditor %s] - No hosts returned valid data." % k + else: + computed = self._gen_stats(stats[k], + name='ALL_%s_last_path' % k) + if computed['reported'] > 0: + self._print_stats(computed) + else: + print "[ALL_auditor] - No hosts returned valid data." + else: + print "[ALL_auditor] - No hosts returned valid data." + if len(zbf_scan) > 0: + stats = {} + stats[atime] = [zbf_scan[i][atime] for i in zbf_scan] + stats[bprocessed] = [zbf_scan[i][bprocessed] for i in zbf_scan] + stats[errors] = [zbf_scan[i][errors] for i in zbf_scan] + stats[quarantined] = [zbf_scan[i][quarantined] for i in zbf_scan] + for k in stats: + if None in stats[k]: + stats[k] = [x for x in stats[k] if x is not None] + if len(stats[k]) < 1: + print "[Auditor %s] - No hosts returned valid data." % k + else: + computed = self._gen_stats(stats[k], + name='ZBF_%s_last_path' % k) + if computed['reported'] > 0: + self._print_stats(computed) + else: + print "[ZBF_auditor] - No hosts returned valid data." + else: + print "[ZBF_auditor] - No hosts returned valid data." print "=" * 79 def load_check(self, hosts): @@ -232,8 +458,7 @@ class SwiftRecon(object): load15 = {} recon = Scout("load", self.verbose, self.suppress_errors, self.timeout) - now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - print "[%s] Checking load avg's on %s hosts..." % (now, len(hosts)) + print "[%s] Checking load averages" % self._ptime() for url, response, status in self.pool.imap(recon.scout, hosts): if status == 200: load1[url] = response['1m'] @@ -242,14 +467,11 @@ class SwiftRecon(object): stats = {"1m": load1, "5m": load5, "15m": load15} for item in stats: if len(stats[item]) > 0: - low = min(stats[item].values()) - high = max(stats[item].values()) - total = sum(stats[item].values()) - average = total / len(stats[item]) - print "[%s load average] lowest: %s, highest: %s, avg: %s" % \ - (item, low, high, average) + computed = self._gen_stats(stats[item].values(), + name='%s_load_avg' % item) + self._print_stats(computed) else: - print "Error: No hosts available or returned valid info." + print "[%s_load_avg] - No hosts returned valid data." % item print "=" * 79 def quarantine_check(self, hosts): @@ -264,8 +486,7 @@ class SwiftRecon(object): acctq = {} recon = Scout("quarantined", self.verbose, self.suppress_errors, self.timeout) - now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - print "[%s] Checking quarantine on %s hosts..." % (now, len(hosts)) + print "[%s] Checking quarantine" % self._ptime() for url, response, status in self.pool.imap(recon.scout, hosts): if status == 200: objq[url] = response['objects'] @@ -274,14 +495,11 @@ class SwiftRecon(object): stats = {"objects": objq, "containers": conq, "accounts": acctq} for item in stats: if len(stats[item]) > 0: - low = min(stats[item].values()) - high = max(stats[item].values()) - total = sum(stats[item].values()) - average = total / len(stats[item]) - print ("[Quarantined %s] low: %d, high: %d, avg: %d, total: %d" - % (item, low, high, average, total)) + computed = self._gen_stats(stats[item].values(), + name='quarantined_%s' % item) + self._print_stats(computed) else: - print "Error: No hosts available or returned valid info." + print "No hosts returned valid data." print "=" * 79 def socket_usage(self, hosts): @@ -298,8 +516,7 @@ class SwiftRecon(object): orphan = {} recon = Scout("sockstat", self.verbose, self.suppress_errors, self.timeout) - now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - print "[%s] Checking socket usage on %s hosts..." % (now, len(hosts)) + print "[%s] Checking socket usage" % self._ptime() for url, response, status in self.pool.imap(recon.scout, hosts): if status == 200: inuse4[url] = response['tcp_in_use'] @@ -312,14 +529,10 @@ class SwiftRecon(object): "orphan": orphan} for item in stats: if len(stats[item]) > 0: - low = min(stats[item].values()) - high = max(stats[item].values()) - total = sum(stats[item].values()) - average = total / len(stats[item]) - print "[%s] low: %d, high: %d, avg: %d, total: %d" % \ - (item, low, high, average, total) + computed = self._gen_stats(stats[item].values(), item) + self._print_stats(computed) else: - print "Error: No hosts or info available." + print "No hosts returned valid data." print "=" * 79 def disk_usage(self, hosts): @@ -334,12 +547,10 @@ class SwiftRecon(object): lows = [] raw_total_used = [] raw_total_avail = [] - averages = [] percents = {} recon = Scout("diskusage", self.verbose, self.suppress_errors, self.timeout) - now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - print "[%s] Checking disk usage on %s hosts..." % (now, len(hosts)) + print "[%s] Checking disk usage now" % self._ptime() for url, response, status in self.pool.imap(recon.scout, hosts): if status == 200: hostusage = [] @@ -357,11 +568,8 @@ class SwiftRecon(object): #get per host hi/los for another day low = min(stats[url]) high = max(stats[url]) - total = sum(stats[url]) - average = total / len(stats[url]) highs.append(high) lows.append(low) - averages.append(average) for percent in stats[url]: percents[int(percent)] = percents.get(int(percent), 0) + 1 else: @@ -370,7 +578,6 @@ class SwiftRecon(object): if len(lows) > 0: low = min(lows) high = max(highs) - average = sum(averages) / len(averages) #dist graph shamelessly stolen from https://github.com/gholt/tcod print "Distribution Graph:" mul = 69.0 / max(percents.values()) @@ -380,12 +587,13 @@ class SwiftRecon(object): raw_used = sum(raw_total_used) raw_avail = sum(raw_total_avail) raw_total = raw_used + raw_avail + avg_used = 100.0 * raw_used / raw_total print "Disk usage: space used: %s of %s" % (raw_used, raw_total) print "Disk usage: space free: %s of %s" % (raw_avail, raw_total) print "Disk usage: lowest: %s%%, highest: %s%%, avg: %s%%" % \ - (low, high, average) + (low, high, avg_used) else: - print "Error: No hosts available or returned valid information." + print "No hosts returned valid data." print "=" * 79 def main(self): @@ -394,7 +602,13 @@ class SwiftRecon(object): """ print "=" * 79 usage = ''' - usage: %prog [-v] [--suppress] [-a] [-r] [-u] [-d] [-l] [--objmd5] + usage: %prog <server_type> [-v] [--suppress] [-a] [-r] [-u] [-d] + [-l] [--md5] [--auditor] [--updater] [--expirer] [--sockstat] + + <server_type>\taccount|container|object + Defaults to object server. + + ex: %prog container -l --auditor ''' args = optparse.OptionParser(usage) args.add_option('--verbose', '-v', action="store_true", @@ -405,6 +619,12 @@ class SwiftRecon(object): help="Get async stats") args.add_option('--replication', '-r', action="store_true", help="Get replication stats") + args.add_option('--auditor', action="store_true", + help="Get auditor stats") + args.add_option('--updater', action="store_true", + help="Get updater stats") + args.add_option('--expirer', action="store_true", + help="Get expirer stats") args.add_option('--unmounted', '-u', action="store_true", help="Check cluster for unmounted devices") args.add_option('--diskusage', '-d', action="store_true", @@ -413,12 +633,12 @@ class SwiftRecon(object): help="Get cluster load average stats") args.add_option('--quarantined', '-q', action="store_true", help="Get cluster quarantine stats") - args.add_option('--objmd5', action="store_true", - help="Get md5sums of object.ring.gz and compare to local copy") + args.add_option('--md5', action="store_true", + help="Get md5sum of servers ring and compare to local copy") args.add_option('--sockstat', action="store_true", help="Get cluster socket usage stats") args.add_option('--all', action="store_true", - help="Perform all checks. Equal to -arudlq --objmd5 --sockstat") + help="Perform all checks. Equal to -arudlq --md5 --sockstat") args.add_option('--zone', '-z', type="int", help="Only query servers in specified zone") args.add_option('--timeout', '-t', type="int", metavar="SECONDS", @@ -427,44 +647,88 @@ class SwiftRecon(object): help="Default = /etc/swift") options, arguments = args.parse_args() - if len(sys.argv) <= 1: + if len(sys.argv) <= 1 or len(arguments) > 1: args.print_help() sys.exit(0) - swift_dir = options.swiftdir - obj_ring = os.path.join(swift_dir, 'object.ring.gz') + if arguments: + if arguments[0] in self.check_types: + self.server_type = arguments[0] + else: + print "Invalid Server Type" + args.print_help() + sys.exit(1) + else: + self.server_type = 'object' + swift_dir = options.swiftdir + ring_file = os.path.join(swift_dir, '%s.ring.gz' % self.server_type) self.verbose = options.verbose self.suppress_errors = options.suppress self.timeout = options.timeout if options.zone: - hosts = self.get_devices(options.zone, swift_dir, 'object') + hosts = self.get_devices(options.zone, swift_dir, self.server_type) else: - hosts = self.get_devices(None, swift_dir, 'object') + hosts = self.get_devices(None, swift_dir, self.server_type) + + print "--> Starting reconnaissance on %s hosts" % len(hosts) + print "=" * 79 if options.all: - self.async_check(hosts) + if self.server_type == 'object': + self.async_check(hosts) + self.object_replication_check(hosts) + self.object_auditor_check(hosts) + self.updater_check(hosts) + self.expirer_check(hosts) + elif self.server_type == 'container': + self.replication_check(hosts) + self.auditor_check(hosts) + self.updater_check(hosts) + elif self.server_type == 'account': + self.replication_check(hosts) + self.auditor_check(hosts) self.umount_check(hosts) - self.replication_check(hosts) self.load_check(hosts) self.disk_usage(hosts) - self.get_ringmd5(hosts, obj_ring) + self.get_ringmd5(hosts, ring_file) self.quarantine_check(hosts) self.socket_usage(hosts) else: if options.async: - self.async_check(hosts) + if self.server_type == 'object': + self.async_check(hosts) + else: + print "Error: Can't check async's on non object servers." if options.unmounted: self.umount_check(hosts) if options.replication: - self.replication_check(hosts) + if self.server_type == 'object': + self.object_replication_check(hosts) + else: + self.replication_check(hosts) + if options.auditor: + if self.server_type == 'object': + self.object_auditor_check(hosts) + else: + self.auditor_check(hosts) + if options.updater: + if self.server_type == 'account': + print "Error: Can't check updaters on account servers." + else: + self.updater_check(hosts) + if options.expirer: + if self.server_type == 'object': + self.expirer_check(hosts) + else: + print "Error: Can't check expired on non object servers." if options.loadstats: self.load_check(hosts) if options.diskusage: self.disk_usage(hosts) - if options.objmd5: - self.get_ringmd5(hosts, obj_ring) + if options.md5: + self.get_ringmd5(hosts, ring_file) if options.quarantined: self.quarantine_check(hosts) if options.sockstat: |