author    Jianjian Huo <jhuo@nvidia.com>  2022-08-24 12:34:04 -0700
committer Jianjian Huo <jhuo@nvidia.com>  2022-09-09 11:04:43 -0700
commit    a53270a15a97c4e4d1c384a1cd10fef0a36af9b5 (patch)
tree      be0766ab3f29cec0dcf1a798dcd1eac33f340af1 /test/probe
parent    c9b898972f9b28e703febc0a98e80177dbd169ca (diff)
download  swift-a53270a15a97c4e4d1c384a1cd10fef0a36af9b5.tar.gz
swift-manage-shard-ranges repair: check for parent-child overlaps.
Stuck shard ranges have been seen in production. The root cause was
traced back to swift-manage-shard-ranges (s-m-s-r) failing to detect
parent-child relationships in overlaps, so it either shrank child
shard ranges into their parents or the other way around. A patch has
been added to check a minimum shard age before s-m-s-r performs a
repair, which will most likely prevent this from happening again, but
we also need to check explicitly for parent-child relationships in
overlaps during repairs. This patch does that: it removes parent or
child shard ranges from the donors and prevents s-m-s-r from
shrinking them into acceptor shard ranges.

Drive-by 1: fix up the gap repair probe test. The probe test is no
longer appropriate because we're no longer allowed to repair
parent-child overlaps, so replace the test with a manually created
gap.

Drive-by 2: address the probe test TODOs. The commented assertions
would fail because the node filtering comparison did not account for
the same node having different indexes when generated for the root
versus the shard. Adding a new generator function, exclude_nodes,
makes the node filtering behave as expected.

Co-Authored-By: Alistair Coles <alistairncoles@gmail.com>
Co-Authored-By: Clay Gerrard <clay.gerrard@gmail.com>
Change-Id: Iaa89e94a2746ba939fb62449e24bdab9666d7bab
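The guard can be pictured with a minimal, self-contained sketch. This
is not Swift's implementation: real ShardRanges track lineage through
their shard account/container naming, and the Range tuple,
filter_donors helper and containment test below are hypothetical
stand-ins. Overlapping ranges that stand in a parent-child
relationship are dropped from the donor set instead of being shrunk
away:

from collections import namedtuple

# (lower, upper] namespace bounds over object names
Range = namedtuple('Range', ['name', 'lower', 'upper'])

def overlaps(a, b):
    # half-open (lower, upper] namespaces intersect
    return a.lower < b.upper and b.lower < a.upper

def is_parent_child(a, b):
    # stand-in for a real lineage check: one namespace wholly
    # contains the other
    return ((a.lower <= b.lower and b.upper <= a.upper) or
            (b.lower <= a.lower and a.upper <= b.upper))

def filter_donors(acceptor, candidates):
    # drop donors that are the acceptor's parent or child: never
    # shrink a child into its parent, or vice versa
    kept, ignored = [], []
    for donor in candidates:
        if overlaps(acceptor, donor) and is_parent_child(acceptor, donor):
            ignored.append(donor)
        else:
            kept.append(donor)
    return kept, ignored

parent = Range('parent', 'a', 'z')
child = Range('child', 'a', 'm')    # transient overlap with its parent
other = Range('other', 'k', 'zz')   # overlaps, but no lineage
kept, ignored = filter_donors(parent, [child, other])
assert kept == [other] and ignored == [child]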
Diffstat (limited to 'test/probe')
-rw-r--r--  test/probe/common.py        |  22
-rw-r--r--  test/probe/test_sharder.py  | 116
2 files changed, 113 insertions(+), 25 deletions(-)
diff --git a/test/probe/common.py b/test/probe/common.py
index 096cc69e5..54212bf8a 100644
--- a/test/probe/common.py
+++ b/test/probe/common.py
@@ -18,6 +18,7 @@ from __future__ import print_function
import errno
import gc
import json
+import mock
import os
from subprocess import Popen, PIPE
import sys
@@ -345,6 +346,27 @@ class Body(object):
next = __next__
+def exclude_nodes(nodes, *excludes):
+ """
+ Iterate over ``nodes`` yielding only those not in ``excludes``.
+
+ The index key of the node dicts is ignored when matching nodes against the
+ ``excludes`` nodes. Index is not a fundamental property of a node but a
+ variable annotation added by the Ring depending upon the partition for
+ which the nodes were generated.
+
+ :param nodes: an iterable of node dicts.
+ :param *excludes: one or more node dicts that should not be yielded.
+ :return: yields node dicts.
+ """
+ for node in nodes:
+ match_node = {k: mock.ANY if k == 'index' else v
+ for k, v in node.items()}
+ if any(exclude == match_node for exclude in excludes):
+ continue
+ yield node
+
+
class ProbeTest(unittest.TestCase):
"""
Don't instantiate this directly, use a child class instead.
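The mock.ANY substitution above is what makes the index key irrelevant
when comparing node dicts: mock.ANY compares equal to any object, so a
copy of a node with its index masked will match an excludes entry
whatever index either side carries. A standalone sketch of the
comparison (node dicts abbreviated; real ring nodes carry more keys):

import mock  # unittest.mock behaves the same way on Python 3

# the same device, annotated with different indexes because the node
# lists were generated for different partitions
node_for_root = {'ip': '127.0.0.1', 'port': 6201,
                 'device': 'sdb1', 'index': 0}
node_for_shard = {'ip': '127.0.0.1', 'port': 6201,
                  'device': 'sdb1', 'index': 2}

# a naive comparison fails on the index key alone...
assert node_for_root != node_for_shard

# ...but masking 'index' with mock.ANY, whose __eq__ returns True
# against any value, lets the dicts compare equal
masked = {k: mock.ANY if k == 'index' else v
          for k, v in node_for_shard.items()}
assert masked == node_for_root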
diff --git a/test/probe/test_sharder.py b/test/probe/test_sharder.py
index 68cace6ff..71fac1233 100644
--- a/test/probe/test_sharder.py
+++ b/test/probe/test_sharder.py
@@ -41,7 +41,7 @@ from test import annotate_failure
from test.probe import PROXY_BASE_URL
from test.probe.brain import BrainSplitter
from test.probe.common import ReplProbeTest, get_server_number, \
- wait_for_server_to_hangup, ENABLED_POLICIES
+ wait_for_server_to_hangup, ENABLED_POLICIES, exclude_nodes
import mock
try:
@@ -3427,9 +3427,9 @@ class TestManagedContainerSharding(BaseTestContainerSharding):
self.assert_container_listing(expected_obj_names)
- def test_manage_shard_ranges_repair_root_shrinking_gaps(self):
- # provoke shrinking/shrunk gaps by prematurely repairing a transient
- # overlap in root container; repair the gap.
+ def test_manage_shard_ranges_repair_parent_child_ranges(self):
+ # Test repairing a transient parent-child shard range overlap in the
+ # root container; expect no repairs to be done.
# note: be careful not to add a container listing to this test which
# would get shard ranges into memcache
obj_names = self._make_object_names(4)
@@ -3477,19 +3477,17 @@ class TestManagedContainerSharding(BaseTestContainerSharding):
shard_ranges[0].account, shard_ranges[0].container)
self.container_replicators.once(
additional_args='--partitions=%s' % shard_part)
- # TODO: get this assertion working (node filtering wonky??)
- # for node in [n for n in shard_nodes if n != self.brain.nodes[0]]:
- # self.assert_container_state(
- # node, 'unsharded', 2, account=shard_ranges[0].account,
- # container=shard_ranges[0].container, part=shard_part)
+ for node in exclude_nodes(shard_nodes, self.brain.nodes[0]):
+ self.assert_container_state(
+ node, 'unsharded', 2, account=shard_ranges[0].account,
+ container=shard_ranges[0].container, part=shard_part)
self.sharders_once(additional_args='--partitions=%s' % shard_part)
# get shards to update state from parent...
self.sharders_once()
- # TODO: get this assertion working (node filtering wonky??)
- # for node in [n for n in shard_nodes if n != self.brain.nodes[0]]:
- # self.assert_container_state(
- # node, 'sharded', 2, account=shard_ranges[0].account,
- # container=shard_ranges[0].container, part=shard_part)
+ for node in exclude_nodes(shard_nodes, self.brain.nodes[0]):
+ self.assert_container_state(
+ node, 'sharded', 2, account=shard_ranges[0].account,
+ container=shard_ranges[0].container, part=shard_part)
# put an object into the second of the 2 sub-shards so that the shard
# will update the root next time the sharder is run; do this before
@@ -3540,34 +3538,96 @@ class TestManagedContainerSharding(BaseTestContainerSharding):
[(sr.state, sr.deleted, sr.lower, sr.upper)
for sr in root_brokers[0].get_shard_ranges(include_deleted=True)])
- # we are allowed to fix the overlap...
+ # try to fix the overlap; expect no repair to be done.
msg = self.assert_subprocess_success(
['swift-manage-shard-ranges', root_0_db_file, 'repair', '--yes',
'--min-shard-age', '0'])
self.assertIn(
- b'Repairs necessary to remove overlapping shard ranges.', msg)
+ b'1 donor shards ignored due to parent-child relationship checks',
+ msg)
+ # verify that the parent-child check has prevented the repair from
+ # being done.
self.assertEqual(
[(ShardRange.ACTIVE, False, ShardRange.MIN, obj_names[1]),
- (ShardRange.SHRINKING, False, obj_names[0], obj_names[1]),
+ # note: overlap!
+ (ShardRange.ACTIVE, False, obj_names[0], obj_names[1]),
(ShardRange.ACTIVE, False, obj_names[1], ShardRange.MAX)],
[(sr.state, sr.deleted, sr.lower, sr.upper)
for sr in root_brokers[0].get_shard_ranges(include_deleted=True)])
+ # the transient overlap is 'fixed' in subsequent sharder cycles...
self.sharders_once()
self.sharders_once()
self.container_replicators.once()
- # boo :'( ... we made gap
for broker in root_brokers:
self.assertEqual(
[(ShardRange.ACTIVE, False, ShardRange.MIN, obj_names[0]),
+ (ShardRange.ACTIVE, False, obj_names[0], obj_names[1]),
(ShardRange.SHARDED, True, ShardRange.MIN, obj_names[1]),
- (ShardRange.SHRUNK, True, obj_names[0], obj_names[1]),
(ShardRange.ACTIVE, False, obj_names[1], ShardRange.MAX)],
[(sr.state, sr.deleted, sr.lower, sr.upper)
for sr in broker.get_shard_ranges(include_deleted=True)])
+ def test_manage_shard_ranges_repair_root_gap(self):
+ # create a gap in root container; repair the gap.
+ # note: be careful not to add a container listing to this test which
+ # would get shard ranges into memcache
+ obj_names = self._make_object_names(8)
+ self.put_objects(obj_names)
+
+ client.post_container(self.url, self.admin_token, self.container_name,
+ headers={'X-Container-Sharding': 'on'})
+
+ # run replicators first time to get sync points set
+ self.container_replicators.once(
+ additional_args='--partitions=%s' % self.brain.part)
+
+ # shard root
+ root_0_db_file = self.get_db_file(self.brain.part, self.brain.nodes[0])
+ self.assert_subprocess_success([
+ 'swift-manage-shard-ranges',
+ root_0_db_file,
+ 'find_and_replace', '2', '--enable'])
+ self.container_replicators.once(
+ additional_args='--partitions=%s' % self.brain.part)
+ for node in self.brain.nodes:
+ self.assert_container_state(node, 'unsharded', 4)
+ self.sharders_once(additional_args='--partitions=%s' % self.brain.part)
+ # get shards to update state from parent...
+ self.sharders_once()
+ for node in self.brain.nodes:
+ self.assert_container_state(node, 'sharded', 4)
+
+ # sanity check, all is well
+ msg = self.assert_subprocess_success([
+ 'swift-manage-shard-ranges', root_0_db_file, 'repair', '--gaps',
+ '--dry-run'])
+ self.assertIn(b'No repairs necessary.', msg)
+
+ # deliberately create a gap in root shard ranges (don't ever do this
+ # for real)
+ # TODO: replace direct broker modification with s-m-s-r merge
+ root_brokers = [self.get_broker(self.brain.part, node)
+ for node in self.brain.nodes]
+ shard_ranges = root_brokers[0].get_shard_ranges()
+ self.assertEqual(4, len(shard_ranges))
+ shard_ranges[2].set_deleted()
+ root_brokers[0].merge_shard_ranges(shard_ranges)
+ shard_ranges = root_brokers[0].get_shard_ranges()
+ self.assertEqual(3, len(shard_ranges))
+ self.container_replicators.once()
+
+ # confirm that we made a gap.
+ for broker in root_brokers:
+ self.assertEqual(
+ [(ShardRange.ACTIVE, False, ShardRange.MIN, obj_names[1]),
+ (ShardRange.ACTIVE, False, obj_names[1], obj_names[3]),
+ (ShardRange.ACTIVE, True, obj_names[3], obj_names[5]),
+ (ShardRange.ACTIVE, False, obj_names[5], ShardRange.MAX)],
+ [(sr.state, sr.deleted, sr.lower, sr.upper)
+ for sr in broker.get_shard_ranges(include_deleted=True)])
+
msg = self.assert_subprocess_success([
'swift-manage-shard-ranges', root_0_db_file, 'repair', '--gaps',
'--yes'])
@@ -3580,10 +3640,10 @@ class TestManagedContainerSharding(BaseTestContainerSharding):
# yay! we fixed the gap (without creating an overlap)
for broker in root_brokers:
self.assertEqual(
- [(ShardRange.ACTIVE, False, ShardRange.MIN, obj_names[0]),
- (ShardRange.SHARDED, True, ShardRange.MIN, obj_names[1]),
- (ShardRange.SHRUNK, True, obj_names[0], obj_names[1]),
- (ShardRange.ACTIVE, False, obj_names[0], ShardRange.MAX)],
+ [(ShardRange.ACTIVE, False, ShardRange.MIN, obj_names[1]),
+ (ShardRange.ACTIVE, False, obj_names[1], obj_names[3]),
+ (ShardRange.ACTIVE, True, obj_names[3], obj_names[5]),
+ (ShardRange.ACTIVE, False, obj_names[3], ShardRange.MAX)],
[(sr.state, sr.deleted, sr.lower, sr.upper)
for sr in broker.get_shard_ranges(include_deleted=True)])
@@ -3596,8 +3656,14 @@ class TestManagedContainerSharding(BaseTestContainerSharding):
'--dry-run'])
self.assertIn(b'No repairs necessary.', msg)
- self.assert_container_listing(
- [obj_names[0], new_obj_name] + obj_names[1:])
+ # put an object into the gap namespace
+ new_objs = [obj_names[4] + 'a']
+ self.put_objects(new_objs)
+ # get root stats up to date
+ self.sharders_once()
+ # new object is in listing but old objects in the gap have been lost -
+ # don't delete shard ranges!
+ self.assert_container_listing(obj_names[:4] + new_objs + obj_names[6:])
def test_manage_shard_ranges_unsharded_deleted_root(self):
# verify that a deleted DB will still be sharded
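Why the repaired listing loses the gap's objects can be seen in a
simplified model (hypothetical code, not Swift's actual listing path):
a sharded listing is assembled by visiting each shard range in
namespace order, so names that fall in a namespace no shard range
covers are never visited, even though they still exist in the orphaned
shard's database; hence the warning never to delete shard ranges.

# Simplified model of a sharded container listing (hypothetical
# helper): walk shard ranges in namespace order, collecting objects
# from each shard; anything only in an uncovered gap is never visited.
def sharded_listing(shard_ranges, objects_by_shard):
    listing = []
    for sr in sorted(shard_ranges, key=lambda sr: sr['lower']):
        listing.extend(objects_by_shard.get(sr['name'], []))
    return listing

ranges = [
    # '' stands in for the MIN/MAX sentinels of real shard ranges
    {'name': 'shard-a', 'lower': '', 'upper': 'obj3'},
    # gap: nothing covers (obj3, obj5]
    {'name': 'shard-c', 'lower': 'obj5', 'upper': ''},
]
objects = {
    'shard-a': ['obj0', 'obj1', 'obj2', 'obj3'],
    'shard-c': ['obj6', 'obj7'],
    'shard-b': ['obj4', 'obj5'],  # orphaned: still on disk, unreachable
}
assert sharded_listing(ranges, objects) == [
    'obj0', 'obj1', 'obj2', 'obj3', 'obj6', 'obj7']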