summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWenbin Zhu <wenbin.zhu@mongodb.com>2021-08-11 18:44:26 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2021-08-11 19:00:00 +0000
commitd21efd2cc4ac16693589f4d7da1cf229c4d08b8d (patch)
tree33563fdc9354a88aaaf1bc6f4324dfc6fdbcecac
parentfb4606ea3efef560014284e99bf5db5755671f58 (diff)
downloadmongo-d21efd2cc4ac16693589f4d7da1cf229c4d08b8d.tar.gz
SERVER-57262 Relax vote constraint to allow voting for candidates with newer config.
(cherry picked from commit e7937256045a2a182c583c7aacaf168e02cf0d06)
-rw-r--r--etc/backports_required_for_multiversion_tests.yml2
-rw-r--r--jstests/replsets/catchup_takeover_with_higher_config.js125
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.cpp8
-rw-r--r--src/mongo/db/repl/topology_coordinator.cpp13
-rw-r--r--src/mongo/db/repl/topology_coordinator_v1_test.cpp18
5 files changed, 151 insertions, 15 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml
index c70824550e8..cbb12a10f24 100644
--- a/etc/backports_required_for_multiversion_tests.yml
+++ b/etc/backports_required_for_multiversion_tests.yml
@@ -165,6 +165,8 @@ all:
test_file: jstests/core/txns/timestamped_reads_wait_for_prepare_oplog_visibility.js
- ticket: SERVER-59197
test_file: jstests/replsets/sessions_collection_reaping.js
+ - ticket: SERVER-57262
+ test_file: jstests/replsets/catchup_takeover_with_higher_config.js
suites:
diff --git a/jstests/replsets/catchup_takeover_with_higher_config.js b/jstests/replsets/catchup_takeover_with_higher_config.js
new file mode 100644
index 00000000000..f0977e0f812
--- /dev/null
+++ b/jstests/replsets/catchup_takeover_with_higher_config.js
@@ -0,0 +1,125 @@
+/**
+ * Test that when a primary is blocked in drain mode, catchup takeover can work even
+ * if the primary has a lower config than the takeover node. The test starts a 3-node
+ * replica set and then steps up node1 but blocks it in drain mode before it can bump
+ * the config term. Next it steps up node2 and also blocks it in drain mode and later
+ * unblocks node1 to let it finish config term bump so that it has higher config than
+ * node2. Eventually after catchUpTakeoverDelayMillis has passed, node1 should be able
+ * get the vote from node2 which has a lower config, and finish the catchup takeover.
+ */
+
+(function() {
+'use strict';
+
+load("jstests/libs/fail_point_util.js");
+load("jstests/libs/write_concern_util.js");
+load('jstests/replsets/libs/election_metrics.js');
+
+// Get the current config from the node and compare it with the provided config.
+const getNodeConfigAndCompare = function(node, config, cmp) {
+ const currentConfig = assert.commandWorked(node.adminCommand({replSetGetConfig: 1})).config;
+ if (cmp === '=') {
+ return currentConfig.term === config.term && currentConfig.version === config.version;
+ } else if (cmp === '>') {
+ return currentConfig.term > config.term ||
+ (currentConfig.term === config.term && currentConfig.version > config.version);
+ } else if (cmp === '<') {
+ return currentConfig.term < config.term ||
+ (currentConfig.term === config.term && currentConfig.version < config.version);
+ } else {
+ assert(false);
+ }
+};
+
+// Wait for all nodes to acknowledge that the node at nodeIndex is in PRIMARY state.
+// A node may see multiple nodes claiming primary, but only checks the provided one.
+const waitForPrimaryState = function(nodes, nodeIndex, timeout) {
+ assert.soon(() => {
+ for (const node of nodes) {
+ const status = assert.commandWorked(node.adminCommand({replSetGetStatus: 1}));
+ if (status.members[nodeIndex].state !== ReplSetTest.State.PRIMARY) {
+ return false;
+ }
+ }
+ return true;
+ }, `Failed to wait for primary state for node ${nodes[nodeIndex].host}`, timeout);
+};
+
+const replSet = new ReplSetTest({name: jsTestName(), nodes: 3});
+const nodes = replSet.startSet();
+let config = replSet.getReplSetConfig();
+// Prevent nodes from syncing from other secondaries.
+config.settings = {
+ chainingAllowed: false,
+};
+replSet.initiateWithHighElectionTimeout(config);
+replSet.awaitReplication();
+assert.eq(replSet.getPrimary(), nodes[0]);
+
+// Failpoint to hang node1 before the automatic reconfig on stepup bumps the config term.
+const hangBeforeTermBumpFpNode1 = configureFailPoint(nodes[1], "hangBeforeReconfigOnDrainComplete");
+const initialConfig = assert.commandWorked(nodes[0].adminCommand({replSetGetConfig: 1})).config;
+
+// Stepup node1 and wait to hang before bumping the config term.
+const statusBeforeTakeover = assert.commandWorked(nodes[1].adminCommand({serverStatus: 1}));
+assert.commandWorked(nodes[1].adminCommand({replSetStepUp: 1}));
+hangBeforeTermBumpFpNode1.wait();
+
+// Wait for all nodes to acknowledge that node1 has become primary.
+jsTestLog(`Waiting for all nodes to agree on ${nodes[1].host} being primary`);
+replSet.awaitNodesAgreeOnPrimary(replSet.kDefaultTimeoutMS, nodes, 1);
+
+// Check that the failpoint worked and the config has not changed.
+assert(getNodeConfigAndCompare(nodes[1], initialConfig, '='));
+
+// Stepup node2 and wait to hang before bumping the config term as well.
+const hangBeforeTermBumpFpNode2 = configureFailPoint(nodes[2], "hangBeforeReconfigOnDrainComplete");
+assert.commandWorked(nodes[2].adminCommand({replSetStepUp: 1}));
+hangBeforeTermBumpFpNode2.wait();
+
+// Wait for all nodes to acknowledge that node2 has become primary. Cannot use
+// awaitNodesAgreeOnPrimary() or getPrimary() here which do not allow a node to
+// see multiple primaries.
+jsTestLog(`Waiting for all nodes to agree on ${nodes[2].host} being primary`);
+waitForPrimaryState(nodes, 2, 30 * 1000);
+
+// Wait for node0 to change its sync source to node2. Later when the failpoint on node 1
+// is lifted, it will do a no-op write and finish the stepup process, so its lastApplied
+// opTime will be greater than the other two nodes. By waiting for sync source change we
+// make sure node0 will not pull new entries from node1, making node1 the only eligible
+// candidate to catchup takeover node2.
+assert.soon(() => {
+ const status = assert.commandWorked(nodes[0].adminCommand({replSetGetStatus: 1}));
+ return status.syncSourceHost === nodes[2].host;
+});
+
+// Lift the failpoint on node1 to let it finish reconfig and bump the config term.
+hangBeforeTermBumpFpNode1.off();
+
+jsTestLog(
+ `Waiting for ${nodes[1].host} to finish config term bump and propagate to ${nodes[0].host}`);
+assert.soon(() => getNodeConfigAndCompare(nodes[0], initialConfig, '>'));
+assert.soon(() => getNodeConfigAndCompare(nodes[1], initialConfig, '>'));
+// Check that node2 is still in catchup mode, so it cannot install a new config.
+assert(getNodeConfigAndCompare(nodes[2], initialConfig, '='));
+
+// Wait for node1 to catchup takeover node2 after the default catchup takeover delay.
+jsTestLog(`Waiting for ${nodes[1].host} to catchup takeover ${nodes[2].host}`);
+waitForPrimaryState(nodes, 1, 60 * 1000);
+
+// Check again that node2 is still in catchup mode and has not installed a new config.
+assert(getNodeConfigAndCompare(nodes[2], initialConfig, '='));
+
+// Lift the failpoint on node2 and wait for all nodes to see node1 as the only primary.
+hangBeforeTermBumpFpNode2.off();
+replSet.awaitNodesAgreeOnPrimary(replSet.kDefaultTimeoutMS, nodes, 1);
+
+// Check that election metrics has been updated with the new reason counter.
+const statusAfterTakeover = assert.commandWorked(nodes[1].adminCommand({serverStatus: 1}));
+verifyServerStatusElectionReasonCounterChange(statusBeforeTakeover.electionMetrics,
+ statusAfterTakeover.electionMetrics,
+ "catchUpTakeover",
+ 1);
+
+replSet.stopSet();
+})();
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index df3fc0c82fd..d37f370a72e 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -126,7 +126,9 @@ MONGO_FAIL_POINT_DEFINE(hangWhileWaitingForHelloResponse);
MONGO_FAIL_POINT_DEFINE(skipDurableTimestampUpdates);
// Will cause a reconfig to hang after completing the config quorum check.
MONGO_FAIL_POINT_DEFINE(omitConfigQuorumCheck);
-// Will cause signal drain complete to hang after reconfig
+// Will cause signal drain complete to hang before reconfig.
+MONGO_FAIL_POINT_DEFINE(hangBeforeReconfigOnDrainComplete);
+// Will cause signal drain complete to hang after reconfig.
MONGO_FAIL_POINT_DEFINE(hangAfterReconfigOnDrainComplete);
// Number of times we tried to go live as a secondary.
@@ -1135,6 +1137,10 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx,
lk.unlock();
if (needBumpConfigTerm) {
+ if (MONGO_unlikely(hangBeforeReconfigOnDrainComplete.shouldFail())) {
+ LOGV2(5726200, "Hanging due to hangBeforeReconfigOnDrainComplete failpoint");
+ hangBeforeReconfigOnDrainComplete.pauseWhileSet(opCtx);
+ }
// We re-write the term but keep version the same. This conceptually a no-op
// in the config consensus group, analogous to writing a new oplog entry
// in Raft log state machine on step up.
diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp
index d46bf2b3c9a..869de681227 100644
--- a/src/mongo/db/repl/topology_coordinator.cpp
+++ b/src/mongo/db/repl/topology_coordinator.cpp
@@ -3101,9 +3101,9 @@ void TopologyCoordinator::processReplSetRequestVotes(const ReplSetRequestVotesAr
return;
}
- if (args.getConfigVersionAndTerm() != _rsConfig.getConfigVersionAndTerm()) {
+ if (args.getConfigVersionAndTerm() < _rsConfig.getConfigVersionAndTerm()) {
response->setVoteGranted(false);
- response->setReason("candidate's config with {} differs from mine with {}"_format(
+ response->setReason("candidate's config with {} is older than mine with {}"_format(
args.getConfigVersionAndTerm(), _rsConfig.getConfigVersionAndTerm()));
} else if (args.getTerm() < _term) {
response->setVoteGranted(false);
@@ -3125,8 +3125,15 @@ void TopologyCoordinator::processReplSetRequestVotes(const ReplSetRequestVotesAr
_rsConfig.getMemberAt(_lastVote.getCandidateIndex()).getHostAndPort(),
_lastVote.getTerm()));
} else {
+ bool isSameConfig = args.getConfigVersionAndTerm() == _rsConfig.getConfigVersionAndTerm();
int betterPrimary = _findHealthyPrimaryOfEqualOrGreaterPriority(args.getCandidateIndex());
- if (_selfConfig().isArbiter() && betterPrimary >= 0) {
+ // Do not grant vote if we are arbiter and can see a healthy primary of greater or equal
+ // priority, to prevent primary flapping when there are two nodes that can't talk to each
+ // other but we that can talk to both as arbiter. We only do this if the voter's config
+ // is same as ours, otherwise the primary information might be stale and we might not be
+ // arbiter in the candidate's newer config. We might also hit an invariant described in
+ // SERVER-46387 without the check for same config.
+ if (isSameConfig && _selfConfig().isArbiter() && betterPrimary >= 0) {
response->setVoteGranted(false);
response->setReason(
"can see a healthy primary ({}) of equal or greater priority"_format(
diff --git a/src/mongo/db/repl/topology_coordinator_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_v1_test.cpp
index 1be2b970576..476adf62f90 100644
--- a/src/mongo/db/repl/topology_coordinator_v1_test.cpp
+++ b/src/mongo/db/repl/topology_coordinator_v1_test.cpp
@@ -3491,7 +3491,7 @@ public:
TEST_F(ConfigTermAndVersionVoteTest, DataNodeDoesNotGrantVoteWhenConfigVersionIsLower) {
auto response = testWithArbiter(false, 1, 2);
ASSERT_EQUALS(
- "candidate's config with {version: 1, term: 2} differs from mine with"
+ "candidate's config with {version: 1, term: 2} is older than mine with"
" {version: 2, term: 2}",
response.getReason());
}
@@ -3499,7 +3499,7 @@ TEST_F(ConfigTermAndVersionVoteTest, DataNodeDoesNotGrantVoteWhenConfigVersionIs
TEST_F(ConfigTermAndVersionVoteTest, ArbiterDoesNotGrantVoteWhenConfigVersionIsLower) {
auto response = testWithArbiter(true, 1, 2);
ASSERT_EQUALS(
- "candidate's config with {version: 1, term: 2} differs from mine with"
+ "candidate's config with {version: 1, term: 2} is older than mine with"
" {version: 2, term: 2}",
response.getReason());
}
@@ -3507,7 +3507,7 @@ TEST_F(ConfigTermAndVersionVoteTest, ArbiterDoesNotGrantVoteWhenConfigVersionIsL
TEST_F(ConfigTermAndVersionVoteTest, DataNodeDoesNotGrantVoteWhenConfigTermIsLower) {
auto response = testWithArbiter(false, 2, 1);
ASSERT_EQUALS(
- "candidate's config with {version: 2, term: 1} differs from mine with"
+ "candidate's config with {version: 2, term: 1} is older than mine with"
" {version: 2, term: 2}",
response.getReason());
}
@@ -3515,7 +3515,7 @@ TEST_F(ConfigTermAndVersionVoteTest, DataNodeDoesNotGrantVoteWhenConfigTermIsLow
TEST_F(ConfigTermAndVersionVoteTest, ArbiterDoesNotGrantVoteWhenConfigTermIsLower) {
auto response = testWithArbiter(true, 2, 1);
ASSERT_EQUALS(
- "candidate's config with {version: 2, term: 1} differs from mine with"
+ "candidate's config with {version: 2, term: 1} is older than mine with"
" {version: 2, term: 2}",
response.getReason());
}
@@ -3683,7 +3683,7 @@ TEST_F(TopoCoordTest, NodeDoesNotGrantDryRunVoteWhenConfigVersionIsLower) {
getTopoCoord().processReplSetRequestVotes(args, &response);
ASSERT_EQUALS(
- "candidate's config with {version: 0, term: 1} differs from mine with {version: 1, term: "
+ "candidate's config with {version: 0, term: 1} is older than mine with {version: 1, term: "
"1}",
response.getReason());
ASSERT_EQUALS(1, response.getTerm());
@@ -3738,7 +3738,7 @@ TEST_F(TopoCoordTest, NodeDoesNotGrantDryRunVoteWhenTermIsStale) {
ASSERT_FALSE(response.getVoteGranted());
}
-TEST_F(TopoCoordTest, NodeDoesNotGrantVoteWhenTermIsHigherButConfigVersionIsLower) {
+TEST_F(TopoCoordTest, NodeGrantsVoteWhenTermIsHigherButConfigVersionIsLower) {
updateConfig(BSON("_id"
<< "rs0"
<< "version" << 2 << "term" << 1LL << "members"
@@ -3769,11 +3769,7 @@ TEST_F(TopoCoordTest, NodeDoesNotGrantVoteWhenTermIsHigherButConfigVersionIsLowe
getTopoCoord().processReplSetRequestVotes(args, &response);
// Candidates config(t, v) is (2, 1) and our config is (1, 2). Even though the candidate's
// config version is lower, we grant our vote because the candidate's config term is higher.
- ASSERT_FALSE(response.getVoteGranted());
- ASSERT_EQ(
- "candidate's config with {version: 1, term: 2} differs from mine with {version: 2, term: "
- "1}",
- response.getReason());
+ ASSERT_TRUE(response.getVoteGranted());
}
TEST_F(TopoCoordTest, GrantDryRunVoteEvenWhenTermHasBeenSeen) {