summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xbin/swift-dispersion-populate152
-rwxr-xr-xbin/swift-dispersion-report249
-rwxr-xr-xbin/swift-stats-populate7
-rwxr-xr-xbin/swift-stats-report5
-rw-r--r--doc/source/admin_guide.rst39
-rw-r--r--etc/dispersion.conf-sample8
-rw-r--r--setup.py1
7 files changed, 437 insertions, 24 deletions
diff --git a/bin/swift-dispersion-populate b/bin/swift-dispersion-populate
new file mode 100755
index 000000000..fe9fc56f0
--- /dev/null
+++ b/bin/swift-dispersion-populate
@@ -0,0 +1,152 @@
+#!/usr/bin/python -u
+# Copyright (c) 2010-2011 OpenStack, LLC.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import traceback
+from ConfigParser import ConfigParser
+from cStringIO import StringIO
+from sys import exit, argv
+from time import time
+from uuid import uuid4
+
+from eventlet import GreenPool, patcher, sleep
+from eventlet.pools import Pool
+
+from swift.common.client import Connection, get_auth
+from swift.common.ring import Ring
+from swift.common.utils import compute_eta, get_time_units
+
+
+def put_container(connpool, container, report):
+ """
+ PUT ``container`` using a connection checked out of ``connpool``.
+
+ Adds ``conn.attempts - 1`` to the module-level ``retries_done`` counter.
+ When ``report`` is truthy it is called with True after the PUT succeeds,
+ or with False when any Exception is raised; the exception is then
+ re-raised to the caller.
+ """
+ global retries_done
+ try:
+ with connpool.item() as conn:
+ conn.put_container(container)
+ # Presumably conn.attempts is the total number of tries for the request,
+ # so everything beyond the first try is counted as a retry -- confirm.
+ retries_done += conn.attempts - 1
+ if report:
+ report(True)
+ except Exception:
+ if report:
+ report(False)
+ raise
+
+
+def put_object(connpool, container, obj, report):
+ """
+ PUT object ``obj`` into ``container`` using a pooled connection.
+
+ The object's body is its own name (``StringIO(obj)``) and the name is
+ also stored in the ``x-object-meta-dispersion`` header. Adds
+ ``conn.attempts - 1`` to the module-level ``retries_done`` counter.
+ When ``report`` is truthy it is called with True after the PUT succeeds,
+ or with False when any Exception is raised; the exception is then
+ re-raised to the caller.
+ """
+ global retries_done
+ try:
+ with connpool.item() as conn:
+ conn.put_object(container, obj, StringIO(obj),
+ headers={'x-object-meta-dispersion': obj})
+ # Presumably conn.attempts is the total number of tries for the request,
+ # so everything beyond the first try is counted as a retry -- confirm.
+ retries_done += conn.attempts - 1
+ if report:
+ report(True)
+ except Exception:
+ if report:
+ report(False)
+ raise
+
+
+def report(success):
+ """
+ Progress callback handed to put_container() / put_object().
+
+ On failure (``success`` is false) prints the current traceback and exits
+ the program with 'Gave up due to error(s).'. On success increments the
+ module-level ``created`` counter and, once ``next_report`` has passed,
+ prints a progress line and pushes ``next_report`` five seconds ahead.
+
+ Relies on the module globals ``begun``, ``created``, ``item_type``,
+ ``next_report``, ``need_to_create`` and ``retries_done`` set by the
+ main script.
+ """
+ global begun, created, item_type, next_report, need_to_create, retries_done
+ if not success:
+ traceback.print_exc()
+ exit('Gave up due to error(s).')
+ created += 1
+ # Rate limit: skip printing until the next scheduled report time.
+ if time() < next_report:
+ return
+ next_report = time() + 5
+ eta, eta_unit = compute_eta(begun, created, need_to_create)
+ # '\r\x1B[K' returns to column 0 and erases the line so the status is
+ # redrawn in place; the trailing comma suppresses the newline.
+ print '\r\x1B[KCreating %s: %d of %d, %d%s left, %d retries' % (item_type,
+ created, need_to_create, round(eta), eta_unit, retries_done),
+
+
+if __name__ == '__main__':
+ # NOTE(review): a ``global`` statement at module scope has no effect; the
+ # names listed are already module globals shared with report() and the
+ # put_* helpers.
+ global begun, created, item_type, next_report, need_to_create, retries_done
+ patcher.monkey_patch()
+
+ # An optional single argument overrides the default config file path.
+ conffile = '/etc/swift/dispersion.conf'
+ if len(argv) == 2:
+ conffile = argv[1]
+ elif len(argv) > 2:
+ exit('Syntax: %s [conffile]' % argv[0])
+ c = ConfigParser()
+ if not c.read(conffile):
+ exit('Unable to read config file: %s' % conffile)
+ # All settings live in the [dispersion] section. auth_url, auth_user and
+ # auth_key are read with conf[...] below and so must be present; the
+ # remaining options fall back to the defaults given here.
+ conf = dict(c.items('dispersion'))
+ swift_dir = conf.get('swift_dir', '/etc/swift')
+ dispersion_coverage = int(conf.get('dispersion_coverage', 1))
+ retries = int(conf.get('retries', 5))
+ concurrency = int(conf.get('concurrency', 25))
+
+ # Green thread pool in which the PUT requests are spawned.
+ coropool = GreenPool(size=concurrency)
+ retries_done = 0
+
+ # Authenticate once; the returned URL and token are handed to every pooled
+ # Connection through preauthurl/preauthtoken.
+ url, token = get_auth(conf['auth_url'], conf['auth_user'],
+ conf['auth_key'])
+ # The account name is taken as the last path segment of the storage URL.
+ account = url.rsplit('/', 1)[1]
+ connpool = Pool(max_size=concurrency)
+ connpool.create = lambda: Connection(conf['auth_url'],
+ conf['auth_user'], conf['auth_key'],
+ retries=retries,
+ preauthurl=url, preauthtoken=token)
+
+ # Phase 1: containers. parts_left holds the container-ring partitions that
+ # have not been assigned a container yet.
+ container_ring = Ring(os.path.join(swift_dir, 'container.ring.gz'))
+ parts_left = dict((x, x) for x in xrange(container_ring.partition_count))
+ item_type = 'containers'
+ created = 0
+ # NOTE(review): retries_done was already set to 0 above; this reset is
+ # redundant for the first phase.
+ retries_done = 0
+ # dispersion_coverage is a percentage of the partition count; the result
+ # is a float.
+ need_to_create = need_to_queue = \
+ dispersion_coverage / 100.0 * container_ring.partition_count
+ # The first progress line may be printed two seconds after the start.
+ begun = next_report = time()
+ next_report += 2
+ # Generate random names until enough of them map to partitions that are
+ # still in parts_left.
+ while need_to_queue >= 1:
+ container = 'dispersion_%s' % uuid4().hex
+ part, _junk = container_ring.get_nodes(account, container)
+ if part in parts_left:
+ coropool.spawn(put_container, connpool, container, report)
+ # Presumably yields so the spawned green threads get a chance to run.
+ sleep()
+ del parts_left[part]
+ need_to_queue -= 1
+ coropool.waitall()
+ elapsed, elapsed_unit = get_time_units(time() - begun)
+ print '\r\x1B[KCreated %d containers for dispersion reporting, %d%s, %d ' \
+ 'retries' % \
+ (need_to_create, round(elapsed), elapsed_unit, retries_done)
+
+ # Phase 2: objects. They all go into one fixed container, created here
+ # with report=None so it is not counted in the progress totals.
+ container = 'dispersion_objects'
+ put_container(connpool, container, None)
+ object_ring = Ring(os.path.join(swift_dir, 'object.ring.gz'))
+ parts_left = dict((x, x) for x in xrange(object_ring.partition_count))
+ item_type = 'objects'
+ created = 0
+ retries_done = 0
+ need_to_create = need_to_queue = \
+ dispersion_coverage / 100.0 * object_ring.partition_count
+ begun = next_report = time()
+ next_report += 2
+ # Same approach as for containers, using the object ring.
+ while need_to_queue >= 1:
+ obj = 'dispersion_%s' % uuid4().hex
+ part, _junk = object_ring.get_nodes(account, container, obj)
+ if part in parts_left:
+ coropool.spawn(put_object, connpool, container, obj, report)
+ sleep()
+ del parts_left[part]
+ need_to_queue -= 1
+ coropool.waitall()
+ elapsed, elapsed_unit = get_time_units(time() - begun)
+ print '\r\x1B[KCreated %d objects for dispersion reporting, %d%s, %d ' \
+ 'retries' % \
+ (need_to_create, round(elapsed), elapsed_unit, retries_done)
diff --git a/bin/swift-dispersion-report b/bin/swift-dispersion-report
new file mode 100755
index 000000000..2ec4c3161
--- /dev/null
+++ b/bin/swift-dispersion-report
@@ -0,0 +1,249 @@
+#!/usr/bin/python -u
+# Copyright (c) 2010-2011 OpenStack, LLC.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import os
+import socket
+from ConfigParser import ConfigParser
+from httplib import HTTPException
+from optparse import OptionParser
+from sys import argv, exit, stderr
+from time import time
+from uuid import uuid4
+
+from eventlet import GreenPool, hubs, patcher, sleep, Timeout
+from eventlet.pools import Pool
+
+from swift.common import direct_client
+from swift.common.client import ClientException, Connection, get_auth
+from swift.common.ring import Ring
+from swift.common.utils import compute_eta, get_time_units
+
+
+# Identifiers ("host:port/device") of devices already reported as unmounted,
+# so that each one is only warned about once per run.
+unmounted = []
+
+def get_error_log(prefix):
+    """
+    Build an error-logging callback that tags its output with ``prefix``.
+
+    The returned ``error_log(msg_or_exc)`` accepts either a message string
+    or an exception object:
+
+    * If the argument has ``http_status == 507`` a one-time "is unmounted"
+      warning is printed to stderr for its ``http_host``, ``http_port`` and
+      ``http_device``; later 507s for the same device are silent.
+    * If the argument has no ``http_status``, or a status other than 404 or
+      507, ``ERROR: <prefix>: <msg_or_exc>`` is printed to stderr.
+
+    :param prefix: text placed in front of generic error messages
+    :returns: the ``error_log`` callable
+    """
+    def error_log(msg_or_exc):
+        global unmounted
+        if hasattr(msg_or_exc, 'http_status') and \
+                msg_or_exc.http_status == 507:
+            # The identifier has to be built from the failing device;
+            # a constant key would make every device after the first one
+            # look as if it had already been reported.
+            identifier = '%s:%s/%s' % (msg_or_exc.http_host,
+                msg_or_exc.http_port, msg_or_exc.http_device)
+            if identifier not in unmounted:
+                unmounted.append(identifier)
+                print >>stderr, 'ERROR: %s is unmounted -- This will ' \
+                    'cause replicas designated for that device to be ' \
+                    'considered missing until resolved or the ring is ' \
+                    'updated.' % identifier
+        # 404 and 507 are expected while replicas are missing or a device
+        # is unmounted; everything else is reported as a generic error.
+        if not hasattr(msg_or_exc, 'http_status') or \
+                msg_or_exc.http_status not in (404, 507):
+            print >>stderr, 'ERROR: %s: %s' % (prefix, msg_or_exc)
+    return error_log
+
+
+def container_dispersion_report(coropool, connpool, account, container_ring,
+ retries):
+ """
+ Print a dispersion report for the account's ``dispersion_`` containers.
+
+ Lists the containers through a pooled connection, issues direct HEAD
+ requests against the nodes the ring returns for each container's
+ partition, and prints how many copies were found versus expected.
+ Prints a hint to stderr and returns early when no containers are listed.
+
+ :param coropool: pool used to spawn the per-container ``direct`` checks
+ :param connpool: pool of client connections used for the account listing
+ :param account: account name passed to the ring and the direct requests
+ :param container_ring: container ring; get_nodes(), replica_count and
+ partition_count are used
+ :param retries: retry count handed to direct_client.retry
+ """
+ with connpool.item() as conn:
+ containers = [c['name'] for c in conn.get_account(prefix='dispersion_',
+ full_listing=True)[1]]
+ containers_listed = len(containers)
+ if not containers_listed:
+ print >>stderr, 'No containers to query. Has ' \
+ 'swift-dispersion-populate been run?'
+ return
+ # One-element lists let the nested direct() update these counters in
+ # place.
+ retries_done = [0]
+ containers_queried = [0]
+ # Index i counts the checks that found exactly i copies.
+ # NOTE(review): four slots cover 0-3 copies; a ring with more than three
+ # replicas would presumably index past the end -- confirm.
+ container_copies_found = [0, 0, 0, 0]
+ begun = time()
+ next_report = [time() + 2]
+ def direct(container, part, nodes):
+ """HEAD ``container`` on each of ``nodes`` and tally the copies found."""
+ found_count = 0
+ for node in nodes:
+ error_log = get_error_log('%(ip)s:%(port)s/%(device)s' % node)
+ try:
+ attempts, _junk = direct_client.retry(
+ direct_client.direct_head_container, node,
+ part, account, container, error_log=error_log,
+ retries=retries)
+ retries_done[0] += attempts - 1
+ found_count += 1
+ except ClientException, err:
+ # 404 and 507 are not reported here; the copy is simply not counted.
+ if err.http_status not in (404, 507):
+ error_log('Giving up on /%s/%s/%s: %s' % (part, account,
+ container, err))
+ except (Exception, Timeout), err:
+ error_log('Giving up on /%s/%s/%s: %s' % (part, account,
+ container, err))
+ container_copies_found[found_count] += 1
+ containers_queried[0] += 1
+ # Progress line: first allowed 2 seconds after start, then every 5.
+ if time() >= next_report[0]:
+ next_report[0] = time() + 5
+ eta, eta_unit = compute_eta(begun, containers_queried[0],
+ containers_listed)
+ print '\r\x1B[KQuerying containers: %d of %d, %d%s left, %d ' \
+ 'retries' % (containers_queried[0], containers_listed,
+ round(eta), eta_unit, retries_done[0]),
+ # container_parts records each distinct partition seen in the listing.
+ container_parts = {}
+ for container in containers:
+ part, nodes = container_ring.get_nodes(account, container)
+ if part not in container_parts:
+ container_parts[part] = part
+ coropool.spawn(direct, container, part, nodes)
+ coropool.waitall()
+ distinct_partitions = len(container_parts)
+ copies_expected = distinct_partitions * container_ring.replica_count
+ # Weighted sum: (number of copies found) * (how many checks found that
+ # many).
+ copies_found = sum(a * b for a, b in enumerate(container_copies_found))
+ value = 100.0 * copies_found / copies_expected
+ elapsed, elapsed_unit = get_time_units(time() - begun)
+ print '\r\x1B[KQueried %d containers for dispersion reporting, ' \
+ '%d%s, %d retries' % (containers_listed, round(elapsed),
+ elapsed_unit, retries_done[0])
+ if containers_listed - distinct_partitions:
+ print 'There were %d overlapping partitions' % (
+ containers_listed - distinct_partitions)
+ # Slots 2, 1 and 0 are reported as missing one, two and all copies.
+ if container_copies_found[2]:
+ print 'There were %d partitions missing one copy.' % \
+ container_copies_found[2]
+ if container_copies_found[1]:
+ print '! There were %d partitions missing two copies.' % \
+ container_copies_found[1]
+ if container_copies_found[0]:
+ print '!!! There were %d partitions missing all copies.' % \
+ container_copies_found[0]
+ print '%.02f%% of container copies found (%d of %d)' % (
+ value, copies_found, copies_expected)
+ print 'Sample represents %.02f%% of the container partition space' % (
+ 100.0 * distinct_partitions / container_ring.partition_count)
+
+
+def object_dispersion_report(coropool, connpool, account, object_ring,
+ retries):
+ """
+ Print a dispersion report for objects in the ``dispersion_objects``
+ container.
+
+ Lists the ``dispersion_`` objects through a pooled connection, issues
+ direct HEAD requests against the nodes the ring returns for each
+ object's partition, and prints how many copies were found versus
+ expected. Prints a hint to stderr and returns early when the listing
+ returns 404 or is empty.
+
+ :param coropool: pool used to spawn the per-object ``direct`` checks
+ :param connpool: pool of client connections used for the listing
+ :param account: account name passed to the ring and the direct requests
+ :param object_ring: object ring; get_nodes(), replica_count and
+ partition_count are used
+ :param retries: retry count handed to direct_client.retry
+ """
+ container = 'dispersion_objects'
+ with connpool.item() as conn:
+ try:
+ objects = [o['name'] for o in conn.get_container(container,
+ prefix='dispersion_', full_listing=True)[1]]
+ except ClientException, err:
+ # A 404 from the listing is treated as "nothing to query"; any other
+ # client error propagates to the caller.
+ if err.http_status != 404:
+ raise
+ print >>stderr, 'No objects to query. Has ' \
+ 'swift-dispersion-populate been run?'
+ return
+ objects_listed = len(objects)
+ if not objects_listed:
+ print >>stderr, 'No objects to query. Has swift-dispersion-populate ' \
+ 'been run?'
+ return
+ # One-element lists let the nested direct() update these counters in
+ # place.
+ retries_done = [0]
+ objects_queried = [0]
+ # Index i counts the checks that found exactly i copies.
+ # NOTE(review): four slots cover 0-3 copies; a ring with more than three
+ # replicas would presumably index past the end -- confirm.
+ object_copies_found = [0, 0, 0, 0]
+ begun = time()
+ next_report = [time() + 2]
+ def direct(obj, part, nodes):
+ """HEAD ``obj`` on each of ``nodes`` and tally the copies found."""
+ found_count = 0
+ for node in nodes:
+ error_log = get_error_log('%(ip)s:%(port)s/%(device)s' % node)
+ try:
+ attempts, _junk = direct_client.retry(
+ direct_client.direct_head_object, node, part,
+ account, container, obj, error_log=error_log,
+ retries=retries)
+ retries_done[0] += attempts - 1
+ found_count += 1
+ except ClientException, err:
+ # 404 and 507 are not reported here; the copy is simply not counted.
+ if err.http_status not in (404, 507):
+ error_log('Giving up on /%s/%s/%s/%s: %s' % (part, account,
+ container, obj, err))
+ except (Exception, Timeout), err:
+ error_log('Giving up on /%s/%s/%s/%s: %s' % (part, account,
+ container, obj, err))
+ object_copies_found[found_count] += 1
+ objects_queried[0] += 1
+ # Progress line: first allowed 2 seconds after start, then every 5.
+ if time() >= next_report[0]:
+ next_report[0] = time() + 5
+ eta, eta_unit = compute_eta(begun, objects_queried[0],
+ objects_listed)
+ print '\r\x1B[KQuerying objects: %d of %d, %d%s left, %d ' \
+ 'retries' % (objects_queried[0], objects_listed, round(eta),
+ eta_unit, retries_done[0]),
+ # object_parts records each distinct partition seen in the listing.
+ object_parts = {}
+ for obj in objects:
+ part, nodes = object_ring.get_nodes(account, container, obj)
+ if part not in object_parts:
+ object_parts[part] = part
+ coropool.spawn(direct, obj, part, nodes)
+ coropool.waitall()
+ distinct_partitions = len(object_parts)
+ copies_expected = distinct_partitions * object_ring.replica_count
+ # Weighted sum: (number of copies found) * (how many checks found that
+ # many).
+ copies_found = sum(a * b for a, b in enumerate(object_copies_found))
+ value = 100.0 * copies_found / copies_expected
+ elapsed, elapsed_unit = get_time_units(time() - begun)
+ print '\r\x1B[KQueried %d objects for dispersion reporting, ' \
+ '%d%s, %d retries' % (objects_listed, round(elapsed),
+ elapsed_unit, retries_done[0])
+ if objects_listed - distinct_partitions:
+ print 'There were %d overlapping partitions' % (
+ objects_listed - distinct_partitions)
+ # Slots 2, 1 and 0 are reported as missing one, two and all copies.
+ if object_copies_found[2]:
+ print 'There were %d partitions missing one copy.' % \
+ object_copies_found[2]
+ if object_copies_found[1]:
+ print '! There were %d partitions missing two copies.' % \
+ object_copies_found[1]
+ if object_copies_found[0]:
+ print '!!! There were %d partitions missing all copies.' % \
+ object_copies_found[0]
+ print '%.02f%% of object copies found (%d of %d)' % \
+ (value, copies_found, copies_expected)
+ print 'Sample represents %.02f%% of the object partition space' % (
+ 100.0 * distinct_partitions / object_ring.partition_count)
+
+
+if __name__ == '__main__':
+ patcher.monkey_patch()
+ hubs.get_hub().debug_exceptions = False
+
+ # An optional single argument overrides the default config file path.
+ conffile = '/etc/swift/dispersion.conf'
+ if len(argv) == 2:
+ conffile = argv[1]
+ elif len(argv) > 2:
+ exit('Syntax: %s [conffile]' % argv[0])
+ c = ConfigParser()
+ if not c.read(conffile):
+ exit('Unable to read config file: %s' % conffile)
+ # All settings live in the [dispersion] section. auth_url, auth_user and
+ # auth_key are read with conf[...] below and so must be present; the
+ # remaining options fall back to the defaults given here.
+ conf = dict(c.items('dispersion'))
+ swift_dir = conf.get('swift_dir', '/etc/swift')
+ # NOTE(review): dispersion_coverage is parsed but not used anywhere else
+ # in this script.
+ dispersion_coverage = int(conf.get('dispersion_coverage', 1))
+ retries = int(conf.get('retries', 5))
+ concurrency = int(conf.get('concurrency', 25))
+
+ # Green thread pool in which the direct HEAD checks are spawned.
+ coropool = GreenPool(size=concurrency)
+
+ # Authenticate once; the returned URL and token are handed to every pooled
+ # Connection through preauthurl/preauthtoken.
+ url, token = get_auth(conf['auth_url'], conf['auth_user'],
+ conf['auth_key'])
+ # The account name is taken as the last path segment of the storage URL.
+ account = url.rsplit('/', 1)[1]
+ connpool = Pool(max_size=concurrency)
+ connpool.create = lambda: Connection(conf['auth_url'],
+ conf['auth_user'], conf['auth_key'],
+ retries=retries,
+ preauthurl=url, preauthtoken=token)
+
+ container_ring = Ring(os.path.join(swift_dir, 'container.ring.gz'))
+ object_ring = Ring(os.path.join(swift_dir, 'object.ring.gz'))
+
+ # Containers are reported first, then objects; both share the same pools.
+ container_dispersion_report(coropool, connpool, account, container_ring,
+ retries)
+ object_dispersion_report(coropool, connpool, account, object_ring, retries)
diff --git a/bin/swift-stats-populate b/bin/swift-stats-populate
index b1f4f0a56..080f159f8 100755
--- a/bin/swift-stats-populate
+++ b/bin/swift-stats-populate
@@ -18,7 +18,7 @@ import os
import traceback
from ConfigParser import ConfigParser
from optparse import OptionParser
-from sys import exit, argv
+from sys import exit, argv, stderr
from time import time
from uuid import uuid4
@@ -77,6 +77,11 @@ if __name__ == '__main__':
global begun, created, item_type, next_report, need_to_create, retries_done
patcher.monkey_patch()
+ print >>stderr, '''
+WARNING: This command is being replaced with swift-dispersion-populate; you
+should switch to that before the next Swift release.
+ '''
+
parser = OptionParser()
parser.add_option('-d', '--dispersion', action='store_true',
dest='dispersion', default=False,
diff --git a/bin/swift-stats-report b/bin/swift-stats-report
index 4c47b404d..e9328a135 100755
--- a/bin/swift-stats-report
+++ b/bin/swift-stats-report
@@ -749,6 +749,11 @@ if __name__ == '__main__':
patcher.monkey_patch()
hubs.get_hub().debug_exceptions = False
+ print >>stderr, '''
+WARNING: This command is being replaced with swift-dispersion-report; you
+should switch to that before the next Swift release.
+ '''
+
parser = OptionParser(usage='''
Usage: %prog [options] [conf_file]
diff --git a/doc/source/admin_guide.rst b/doc/source/admin_guide.rst
index bb3eef6fa..ab112bbb3 100644
--- a/doc/source/admin_guide.rst
+++ b/doc/source/admin_guide.rst
@@ -134,9 +134,9 @@ different distro or OS, some care should be taken before using in production.
Cluster Health
--------------
-There is a swift-stats-report tool for measuring overall cluster health. This
-is accomplished by checking if a set of deliberately distributed containers and
-objects are currently in their proper places within the cluster.
+There is a swift-dispersion-report tool for measuring overall cluster health.
+This is accomplished by checking if a set of deliberately distributed
+containers and objects are currently in their proper places within the cluster.
For instance, a common deployment has three replicas of each object. The health
of that object can be measured by checking if each replica is in its proper
@@ -153,15 +153,15 @@ to gather results.
The first thing that needs to be done to provide this health value is create a
new account solely for this usage. Next, we need to place the containers and
objects throughout the system so that they are on distinct partitions. The
-swift-stats-populate tool does this by making up random container and object
-names until they fall on distinct partitions. Last, and repeatedly for the life
-of the cluster, we need to run the swift-stats-report tool to check the health
-of each of these containers and objects.
+swift-dispersion-populate tool does this by making up random container and
+object names until they fall on distinct partitions. Last, and repeatedly for
+the life of the cluster, we need to run the swift-dispersion-report tool to
+check the health of each of these containers and objects.
These tools need direct access to the entire cluster and to the ring files
(installing them on a proxy server will probably do). Both
-swift-stats-populate and swift-stats-report use the same configuration file,
-/etc/swift/stats.conf. Example conf file::
+swift-dispersion-populate and swift-dispersion-report use the same
+configuration file, /etc/swift/dispersion.conf. Example conf file::
-    [stats]
+    [dispersion]
auth_url = http://saio:11000/auth/v1.0
@@ -169,17 +169,17 @@ swift-stats-populate and swift-stats-report use the same configuration file,
auth_key = testing
There are also options for the conf file for specifying the dispersion coverage
-(defaults to 1%), retries, concurrency, CSV output file, etc. though usually
-the defaults are fine.
+(defaults to 1%), retries, concurrency, etc. though usually the defaults are
+fine.
-Once the configuration is in place, run `swift-stats-populate -d` to populate
+Once the configuration is in place, run `swift-dispersion-populate` to populate
the containers and objects throughout the cluster.
Now that those containers and objects are in place, you can run
-`swift-stats-report -d` to get a dispersion report, or the overall health of
+`swift-dispersion-report` to get a dispersion report, or the overall health of
the cluster. Here is an example of a cluster in perfect health::
- $ swift-stats-report -d
+ $ swift-dispersion-report
Queried 2621 containers for dispersion reporting, 19s, 0 retries
100.00% of container copies found (7863 of 7863)
Sample represents 1.00% of the container partition space
@@ -195,7 +195,7 @@ that has::
$ swift-ring-builder object.builder set_weight d0 200
$ swift-ring-builder object.builder rebalance
...
- $ swift-stats-report -d
+ $ swift-dispersion-report
Queried 2621 containers for dispersion reporting, 8s, 0 retries
100.00% of container copies found (7863 of 7863)
Sample represents 1.00% of the container partition space
@@ -212,7 +212,7 @@ is much less. Next, I'll run the replicators to get everything put back into
place and then rerun the dispersion report::
... start object replicators and monitor logs until they're caught up ...
- $ swift-stats-report -d
+ $ swift-dispersion-report
Queried 2621 containers for dispersion reporting, 17s, 0 retries
100.00% of container copies found (7863 of 7863)
Sample represents 1.00% of the container partition space
@@ -221,13 +221,6 @@ place and then rerun the dispersion report::
100.00% of object copies found (7857 of 7857)
Sample represents 1.00% of the object partition space
-So that's a summation of how to use swift-stats-report to monitor the health of
-a cluster. There are a few other things it can do, such as performance
-monitoring, but those are currently in their infancy and little used. For
-instance, you can run `swift-stats-populate -p` and `swift-stats-report -p` to
-get performance timings (warning: the initial populate takes a while). These
-timings are dumped into a CSV file (/etc/swift/stats.csv by default) and can
-then be graphed to see how cluster performance is trending.
------------------------------------
Additional Cleanup Script for Swauth
diff --git a/etc/dispersion.conf-sample b/etc/dispersion.conf-sample
new file mode 100644
index 000000000..09c4290e7
--- /dev/null
+++ b/etc/dispersion.conf-sample
@@ -0,0 +1,8 @@
+[dispersion]
+auth_url = http://saio:8080/auth/v1.0
+auth_user = test:tester
+auth_key = testing
+# swift_dir = /etc/swift
+# dispersion_coverage = 1
+# retries = 5
+# concurrency = 25
diff --git a/setup.py b/setup.py
index ccd1d4dd9..93d13a16a 100644
--- a/setup.py
+++ b/setup.py
@@ -90,6 +90,7 @@ setup(
'bin/swift-object-updater', 'bin/swift-proxy-server',
'bin/swift-ring-builder', 'bin/swift-stats-populate',
'bin/swift-stats-report',
+ 'bin/swift-dispersion-populate', 'bin/swift-dispersion-report',
'bin/swift-bench',
'bin/swift-log-uploader',
'bin/swift-log-stats-collector',