author     Jenkins <jenkins@review.openstack.org>     2017-02-15 03:54:49 +0000
committer  Gerrit Code Review <review@openstack.org>  2017-02-15 03:54:49 +0000
commit     cdd72dd34f879c7340d4c379db955ea82db3cfe2 (patch)
tree       8ca1c33d7333b25bc0479b10a825a2165af23850
parent     801d281a4aebb2ce8fb13bae522cfee1353938af (diff)
parent     da557011ecc1ec46bce24bd896ec50cbed914ba6 (diff)
download   swift-cdd72dd34f879c7340d4c379db955ea82db3cfe2.tar.gz
Merge "Deprecate broken handoffs_first in favor of handoffs_only"
-rw-r--r--  etc/object-server.conf-sample          13
-rw-r--r--  swift/obj/reconstructor.py             47
-rw-r--r--  test/unit/obj/test_reconstructor.py   101
3 files changed, 145 insertions, 16 deletions
diff --git a/etc/object-server.conf-sample b/etc/object-server.conf-sample
index ad52c2430..f03e5af1d 100644
--- a/etc/object-server.conf-sample
+++ b/etc/object-server.conf-sample
@@ -298,7 +298,18 @@ use = egg:swift#recon
# lockup_timeout = 1800
# ring_check_interval = 15
# recon_cache_path = /var/cache/swift
-# handoffs_first = False
+# The handoffs_only mode option is for special case emergency situations during
+# rebalance such as disk full in the cluster. This option SHOULD NOT BE
+# CHANGED, except for extreme situations. When handoffs_only mode is enabled
+# the reconstructor will *only* revert rebalance fragments to primaries and not
+# attempt to sync any primary parts with neighbor primaries. This will force
+# the reconstructor to sync and delete handoffs fragments more quickly and
+# minimize the time of the rebalance by limiting the number of rebuilds. The
+# handoffs_only option is only for temporary use, it should be disabled as soon
+# as the emergency situation is resolved. When handoffs_only is not set, the
+# deprecated handoffs_first option will be honored as a synonym, but may be
+# ignored in a future release.
+# handoffs_only = False
#
# You can set scheduling priority of processes. Niceness values range from -20
# (most favorable to the process) to 19 (least favorable to the process).
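For orientation, a minimal operator-facing example of the new setting: the snippet below shows the [object-reconstructor] section of object-server.conf with the option temporarily enabled during a rebalance emergency. The values and comments are illustrative, not part of this change, and the flag should be returned to its default of False once handoffs are drained.

[object-reconstructor]
# temporarily enabled while reverting handoffs off of a full disk; set back
# to False (the default) once all nodes report no handoffs remaining
handoffs_only = True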
diff --git a/swift/obj/reconstructor.py b/swift/obj/reconstructor.py
index 2460a5e6f..b0f35cb6d 100644
--- a/swift/obj/reconstructor.py
+++ b/swift/obj/reconstructor.py
@@ -149,8 +149,24 @@ class ObjectReconstructor(Daemon):
self.headers = {
'Content-Length': '0',
'user-agent': 'obj-reconstructor %s' % os.getpid()}
- self.handoffs_first = config_true_value(conf.get('handoffs_first',
- False))
+ if 'handoffs_first' in conf:
+ self.logger.warning(
+ 'The handoffs_first option is deprecated in favor '
+ 'of handoffs_only. This option may be ignored in a '
+ 'future release.')
+ # honor handoffs_first for backwards compatibility
+ default_handoffs_only = config_true_value(conf['handoffs_first'])
+ else:
+ default_handoffs_only = False
+ self.handoffs_only = config_true_value(
+ conf.get('handoffs_only', default_handoffs_only))
+ if self.handoffs_only:
+ self.logger.warning(
+ 'Handoff only mode is not intended for normal '
+ 'operation, use handoffs_only with care.')
+ elif default_handoffs_only:
+ self.logger.warning('Ignored handoffs_first option in favor '
+ 'of handoffs_only.')
self._df_router = DiskFileRouter(conf, self.logger)
def load_object_ring(self, policy):
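Restated outside the diff for clarity, the precedence between the deprecated option and the new one works roughly as follows. This is an illustrative sketch: resolve_handoffs_only() is a made-up standalone helper, not a function in Swift; only config_true_value is the real utility from swift.common.utils.

from swift.common.utils import config_true_value


def resolve_handoffs_only(conf, logger):
    # illustrative helper, not part of Swift
    if 'handoffs_first' in conf:
        logger.warning('The handoffs_first option is deprecated in favor '
                       'of handoffs_only. This option may be ignored in a '
                       'future release.')
        # the deprecated option only supplies the default for the new one
        default = config_true_value(conf['handoffs_first'])
    else:
        default = False
    handoffs_only = config_true_value(conf.get('handoffs_only', default))
    if handoffs_only:
        logger.warning('Handoff only mode is not intended for normal '
                       'operation, use handoffs_only with care.')
    elif default:
        logger.warning('Ignored handoffs_first option in favor of '
                       'handoffs_only.')
    return handoffs_only

# resolve_handoffs_only({'handoffs_first': 'true'}, log)                 -> True
# resolve_handoffs_only({'handoffs_first': 'true',
#                        'handoffs_only': 'false'}, log)                 -> False
# resolve_handoffs_only({'handoffs_only': 'true'}, log)                  -> True
# resolve_handoffs_only({}, log)                                         -> False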
@@ -656,6 +672,8 @@ class ObjectReconstructor(Daemon):
if syncd_with >= len(job['sync_to']):
self.delete_reverted_objs(
job, reverted_objs, job['frag_index'])
+ else:
+ self.handoffs_remaining += 1
self.logger.timing_since('partition.delete.timing', begin)
def _get_part_jobs(self, local_dev, part_path, partition, policy):
@@ -684,6 +702,9 @@ class ObjectReconstructor(Daemon):
:param policy: the policy
:returns: a list of dicts of job info
+
+ N.B. If this function ever returns an empty list of jobs the entire
+ partition will be deleted.
"""
# find all the fi's in the part, and which suffixes have them
try:
@@ -876,12 +897,12 @@ class ObjectReconstructor(Daemon):
"""
Helper function for collect_jobs to build jobs for reconstruction
using EC style storage policy
+
+ N.B. If this function ever returns an empty list of jobs the entire
+ partition will be deleted.
"""
jobs = self._get_part_jobs(**part_info)
random.shuffle(jobs)
- if self.handoffs_first:
- # Move the handoff revert jobs to the front of the list
- jobs.sort(key=lambda job: job['job_type'], reverse=True)
self.job_count += len(jobs)
return jobs
@@ -897,6 +918,7 @@ class ObjectReconstructor(Daemon):
self.reconstruction_part_count = 0
self.reconstruction_device_count = 0
self.last_reconstruction_count = -1
+ self.handoffs_remaining = 0
def delete_partition(self, path):
self.logger.info(_("Removing partition: %s"), path)
@@ -932,6 +954,11 @@ class ObjectReconstructor(Daemon):
self.run_pool.spawn(self.delete_partition,
part_info['part_path'])
for job in jobs:
+ if (self.handoffs_only and job['job_type'] != REVERT):
+ self.logger.debug('Skipping %s job for %s '
+ 'while in handoffs_only mode.',
+ job['job_type'], job['path'])
+ continue
self.run_pool.spawn(self.process_job, job)
with Timeout(self.lockup_timeout):
self.run_pool.waitall()
@@ -943,6 +970,16 @@ class ObjectReconstructor(Daemon):
stats.kill()
lockup_detector.kill()
self.stats_line()
+ if self.handoffs_only:
+ if self.handoffs_remaining > 0:
+ self.logger.info(_(
+ "Handoffs only mode still has handoffs remaining. "
+ "Next pass will continue to revert handoffs."))
+ else:
+ self.logger.warning(_(
+ "Handoffs only mode found no handoffs remaining. "
+ "You should disable handoffs_only once all nodes "
+ "are reporting no handoffs remaining."))
def run_once(self, *args, **kwargs):
start = time.time()
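Putting the reconstructor hunks together, the runtime effect of handoffs_only can be summarized as below. This is a condensed illustration rather than Swift code: REVERT is the real constant from swift.obj.reconstructor, while run_handoffs_only_pass() and the process_revert_job callable (assumed to return True when a revert job synced with every node in its sync_to list) are stand-ins.

from swift.obj.reconstructor import REVERT


def run_handoffs_only_pass(jobs, process_revert_job, logger):
    # stand-in sketch of one reconstructor pass with handoffs_only enabled
    handoffs_remaining = 0
    for job in jobs:
        if job['job_type'] != REVERT:
            # primary sync/rebuild work is skipped while the mode is enabled
            continue
        if not process_revert_job(job):
            # a revert that could not reach all of its sync_to nodes stays on
            # disk and is retried on the next pass
            handoffs_remaining += 1
    if handoffs_remaining:
        logger.info('Handoffs only mode still has handoffs remaining. '
                    'Next pass will continue to revert handoffs.')
    else:
        logger.warning('Handoffs only mode found no handoffs remaining. '
                       'You should disable handoffs_only once all nodes '
                       'are reporting no handoffs remaining.')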
diff --git a/test/unit/obj/test_reconstructor.py b/test/unit/obj/test_reconstructor.py
index f6d040fa7..2b6872ce4 100644
--- a/test/unit/obj/test_reconstructor.py
+++ b/test/unit/obj/test_reconstructor.py
@@ -719,7 +719,6 @@ class TestGlobalSetupObjectReconstructor(unittest.TestCase):
rmtree(testring, ignore_errors=1)
def test_build_reconstruction_jobs(self):
- self.reconstructor.handoffs_first = False
self.reconstructor._reset_stats()
for part_info in self.reconstructor.collect_parts():
jobs = self.reconstructor.build_reconstruction_jobs(part_info)
@@ -728,13 +727,40 @@ class TestGlobalSetupObjectReconstructor(unittest.TestCase):
object_reconstructor.REVERT))
self.assert_expected_jobs(part_info['partition'], jobs)
- self.reconstructor.handoffs_first = True
- self.reconstructor._reset_stats()
- for part_info in self.reconstructor.collect_parts():
- jobs = self.reconstructor.build_reconstruction_jobs(part_info)
- self.assertTrue(jobs[0]['job_type'] ==
- object_reconstructor.REVERT)
- self.assert_expected_jobs(part_info['partition'], jobs)
+ def test_handoffs_only(self):
+ self.reconstructor.handoffs_only = True
+
+ found_job_types = set()
+
+ def fake_process_job(job):
+ # increment failure counter
+ self.reconstructor.handoffs_remaining += 1
+ found_job_types.add(job['job_type'])
+
+ self.reconstructor.process_job = fake_process_job
+
+ # only revert jobs
+ self.reconstructor.reconstruct()
+ self.assertEqual(found_job_types, {object_reconstructor.REVERT})
+ # but failures keep handoffs remaining
+ msgs = self.reconstructor.logger.get_lines_for_level('info')
+ self.assertIn('Next pass will continue to revert handoffs', msgs[-1])
+ self.logger._clear()
+
+ found_job_types = set()
+
+ def fake_process_job(job):
+ # success does not increment failure counter
+ found_job_types.add(job['job_type'])
+
+ self.reconstructor.process_job = fake_process_job
+
+ # only revert jobs ... but all handoffs cleared out successfully
+ self.reconstructor.reconstruct()
+ self.assertEqual(found_job_types, {object_reconstructor.REVERT})
+ # it's time to turn off handoffs_only
+ msgs = self.reconstructor.logger.get_lines_for_level('warning')
+ self.assertIn('You should disable handoffs_only', msgs[-1])
def test_get_partners(self):
# we're going to perform an exhaustive test of every possible
@@ -1156,6 +1182,57 @@ class TestObjectReconstructor(unittest.TestCase):
def ts(self):
return next(self.ts_iter)
+ def test_handoffs_only_default(self):
+ # sanity neither option added to default conf
+ self.conf.pop('handoffs_only', None)
+ self.conf.pop('handoffs_first', None)
+ self.reconstructor = object_reconstructor.ObjectReconstructor(
+ self.conf, logger=self.logger)
+ self.assertFalse(self.reconstructor.handoffs_only)
+
+ def test_handoffs_first_enables_handoffs_only(self):
+ self.conf.pop('handoffs_only', None) # sanity
+ self.conf['handoffs_first'] = True
+ self.reconstructor = object_reconstructor.ObjectReconstructor(
+ self.conf, logger=self.logger)
+ self.assertTrue(self.reconstructor.handoffs_only)
+ warnings = self.logger.get_lines_for_level('warning')
+ expected = [
+ 'The handoffs_first option is deprecated in favor '
+ 'of handoffs_only. This option may be ignored in a '
+ 'future release.',
+ 'Handoff only mode is not intended for normal operation, '
+ 'use handoffs_only with care.',
+ ]
+ self.assertEqual(expected, warnings)
+
+ def test_handoffs_only_ignores_handoffs_first(self):
+ self.conf['handoffs_first'] = True
+ self.conf['handoffs_only'] = False
+ self.reconstructor = object_reconstructor.ObjectReconstructor(
+ self.conf, logger=self.logger)
+ self.assertFalse(self.reconstructor.handoffs_only)
+ warnings = self.logger.get_lines_for_level('warning')
+ expected = [
+ 'The handoffs_first option is deprecated in favor of '
+ 'handoffs_only. This option may be ignored in a future release.',
+ 'Ignored handoffs_first option in favor of handoffs_only.',
+ ]
+ self.assertEqual(expected, warnings)
+
+ def test_handoffs_only_enabled(self):
+ self.conf.pop('handoffs_first', None) # sanity
+ self.conf['handoffs_only'] = True
+ self.reconstructor = object_reconstructor.ObjectReconstructor(
+ self.conf, logger=self.logger)
+ self.assertTrue(self.reconstructor.handoffs_only)
+ warnings = self.logger.get_lines_for_level('warning')
+ expected = [
+ 'Handoff only mode is not intended for normal operation, '
+ 'use handoffs_only with care.',
+ ]
+ self.assertEqual(expected, warnings)
+
def test_two_ec_policies(self):
with patch_policies([
StoragePolicy(0, name='zero', is_deprecated=True),
@@ -2345,7 +2422,7 @@ class TestObjectReconstructor(unittest.TestCase):
self.assertEqual(call['node'], sync_to[0])
self.assertEqual(set(call['suffixes']), set(['123', 'abc']))
- def test_process_job_revert_is_handoff(self):
+ def test_process_job_revert_is_handoff_fails(self):
replicas = self.policy.object_ring.replicas
frag_index = random.randint(0, replicas - 1)
sync_to = [random.choice([n for n in self.policy.object_ring.devs
@@ -2375,7 +2452,8 @@ class TestObjectReconstructor(unittest.TestCase):
def ssync_response_callback(*args):
# in this test ssync always fails, until we encounter ourselves in
- # the list of possible handoff's to sync to
+ # the list of possible handoff's to sync to, so handoffs_remaining
+ # should increment
return False, {}
expected_suffix_calls = set([
@@ -2401,6 +2479,7 @@ class TestObjectReconstructor(unittest.TestCase):
call = ssync_calls[0]
self.assertEqual(call['node'], sync_to[0])
self.assertEqual(set(call['suffixes']), set(['123', 'abc']))
+ self.assertEqual(self.reconstructor.handoffs_remaining, 1)
def test_process_job_revert_cleanup(self):
replicas = self.policy.object_ring.replicas
@@ -2446,6 +2525,7 @@ class TestObjectReconstructor(unittest.TestCase):
}
def ssync_response_callback(*args):
+ # success should not increment handoffs_remaining
return True, {ohash: {'ts_data': ts}}
ssync_calls = []
@@ -2469,6 +2549,7 @@ class TestObjectReconstructor(unittest.TestCase):
df_mgr.get_hashes(self.local_dev['device'], str(partition), [],
self.policy)
self.assertFalse(os.access(df._datadir, os.F_OK))
+ self.assertEqual(self.reconstructor.handoffs_remaining, 0)
def test_process_job_revert_cleanup_tombstone(self):
sync_to = [random.choice([n for n in self.policy.object_ring.devs