diff options
-rw-r--r-- | etc/backports_required_for_multiversion_tests.yml | 2 | ||||
-rw-r--r-- | jstests/replsets/catchup_takeover_with_higher_config.js | 125 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.cpp | 8 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator.cpp | 13 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator_v1_test.cpp | 18 |
5 files changed, 151 insertions, 15 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml index c70824550e8..cbb12a10f24 100644 --- a/etc/backports_required_for_multiversion_tests.yml +++ b/etc/backports_required_for_multiversion_tests.yml @@ -165,6 +165,8 @@ all: test_file: jstests/core/txns/timestamped_reads_wait_for_prepare_oplog_visibility.js - ticket: SERVER-59197 test_file: jstests/replsets/sessions_collection_reaping.js + - ticket: SERVER-57262 + test_file: jstests/replsets/catchup_takeover_with_higher_config.js suites: diff --git a/jstests/replsets/catchup_takeover_with_higher_config.js b/jstests/replsets/catchup_takeover_with_higher_config.js new file mode 100644 index 00000000000..f0977e0f812 --- /dev/null +++ b/jstests/replsets/catchup_takeover_with_higher_config.js @@ -0,0 +1,125 @@ +/** + * Test that when a primary is blocked in drain mode, catchup takeover can work even + * if the primary has a lower config than the takeover node. The test starts a 3-node + * replica set and then steps up node1 but blocks it in drain mode before it can bump + * the config term. Next it steps up node2 and also blocks it in drain mode and later + * unblocks node1 to let it finish config term bump so that it has higher config than + * node2. Eventually after catchUpTakeoverDelayMillis has passed, node1 should be able + * get the vote from node2 which has a lower config, and finish the catchup takeover. + */ + +(function() { +'use strict'; + +load("jstests/libs/fail_point_util.js"); +load("jstests/libs/write_concern_util.js"); +load('jstests/replsets/libs/election_metrics.js'); + +// Get the current config from the node and compare it with the provided config. +const getNodeConfigAndCompare = function(node, config, cmp) { + const currentConfig = assert.commandWorked(node.adminCommand({replSetGetConfig: 1})).config; + if (cmp === '=') { + return currentConfig.term === config.term && currentConfig.version === config.version; + } else if (cmp === '>') { + return currentConfig.term > config.term || + (currentConfig.term === config.term && currentConfig.version > config.version); + } else if (cmp === '<') { + return currentConfig.term < config.term || + (currentConfig.term === config.term && currentConfig.version < config.version); + } else { + assert(false); + } +}; + +// Wait for all nodes to acknowledge that the node at nodeIndex is in PRIMARY state. +// A node may see multiple nodes claiming primary, but only checks the provided one. +const waitForPrimaryState = function(nodes, nodeIndex, timeout) { + assert.soon(() => { + for (const node of nodes) { + const status = assert.commandWorked(node.adminCommand({replSetGetStatus: 1})); + if (status.members[nodeIndex].state !== ReplSetTest.State.PRIMARY) { + return false; + } + } + return true; + }, `Failed to wait for primary state for node ${nodes[nodeIndex].host}`, timeout); +}; + +const replSet = new ReplSetTest({name: jsTestName(), nodes: 3}); +const nodes = replSet.startSet(); +let config = replSet.getReplSetConfig(); +// Prevent nodes from syncing from other secondaries. +config.settings = { + chainingAllowed: false, +}; +replSet.initiateWithHighElectionTimeout(config); +replSet.awaitReplication(); +assert.eq(replSet.getPrimary(), nodes[0]); + +// Failpoint to hang node1 before the automatic reconfig on stepup bumps the config term. +const hangBeforeTermBumpFpNode1 = configureFailPoint(nodes[1], "hangBeforeReconfigOnDrainComplete"); +const initialConfig = assert.commandWorked(nodes[0].adminCommand({replSetGetConfig: 1})).config; + +// Stepup node1 and wait to hang before bumping the config term. +const statusBeforeTakeover = assert.commandWorked(nodes[1].adminCommand({serverStatus: 1})); +assert.commandWorked(nodes[1].adminCommand({replSetStepUp: 1})); +hangBeforeTermBumpFpNode1.wait(); + +// Wait for all nodes to acknowledge that node1 has become primary. +jsTestLog(`Waiting for all nodes to agree on ${nodes[1].host} being primary`); +replSet.awaitNodesAgreeOnPrimary(replSet.kDefaultTimeoutMS, nodes, 1); + +// Check that the failpoint worked and the config has not changed. +assert(getNodeConfigAndCompare(nodes[1], initialConfig, '=')); + +// Stepup node2 and wait to hang before bumping the config term as well. +const hangBeforeTermBumpFpNode2 = configureFailPoint(nodes[2], "hangBeforeReconfigOnDrainComplete"); +assert.commandWorked(nodes[2].adminCommand({replSetStepUp: 1})); +hangBeforeTermBumpFpNode2.wait(); + +// Wait for all nodes to acknowledge that node2 has become primary. Cannot use +// awaitNodesAgreeOnPrimary() or getPrimary() here which do not allow a node to +// see multiple primaries. +jsTestLog(`Waiting for all nodes to agree on ${nodes[2].host} being primary`); +waitForPrimaryState(nodes, 2, 30 * 1000); + +// Wait for node0 to change its sync source to node2. Later when the failpoint on node 1 +// is lifted, it will do a no-op write and finish the stepup process, so its lastApplied +// opTime will be greater than the other two nodes. By waiting for sync source change we +// make sure node0 will not pull new entries from node1, making node1 the only eligible +// candidate to catchup takeover node2. +assert.soon(() => { + const status = assert.commandWorked(nodes[0].adminCommand({replSetGetStatus: 1})); + return status.syncSourceHost === nodes[2].host; +}); + +// Lift the failpoint on node1 to let it finish reconfig and bump the config term. +hangBeforeTermBumpFpNode1.off(); + +jsTestLog( + `Waiting for ${nodes[1].host} to finish config term bump and propagate to ${nodes[0].host}`); +assert.soon(() => getNodeConfigAndCompare(nodes[0], initialConfig, '>')); +assert.soon(() => getNodeConfigAndCompare(nodes[1], initialConfig, '>')); +// Check that node2 is still in catchup mode, so it cannot install a new config. +assert(getNodeConfigAndCompare(nodes[2], initialConfig, '=')); + +// Wait for node1 to catchup takeover node2 after the default catchup takeover delay. +jsTestLog(`Waiting for ${nodes[1].host} to catchup takeover ${nodes[2].host}`); +waitForPrimaryState(nodes, 1, 60 * 1000); + +// Check again that node2 is still in catchup mode and has not installed a new config. +assert(getNodeConfigAndCompare(nodes[2], initialConfig, '=')); + +// Lift the failpoint on node2 and wait for all nodes to see node1 as the only primary. +hangBeforeTermBumpFpNode2.off(); +replSet.awaitNodesAgreeOnPrimary(replSet.kDefaultTimeoutMS, nodes, 1); + +// Check that election metrics has been updated with the new reason counter. +const statusAfterTakeover = assert.commandWorked(nodes[1].adminCommand({serverStatus: 1})); +verifyServerStatusElectionReasonCounterChange(statusBeforeTakeover.electionMetrics, + statusAfterTakeover.electionMetrics, + "catchUpTakeover", + 1); + +replSet.stopSet(); +})(); diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index df3fc0c82fd..d37f370a72e 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -126,7 +126,9 @@ MONGO_FAIL_POINT_DEFINE(hangWhileWaitingForHelloResponse); MONGO_FAIL_POINT_DEFINE(skipDurableTimestampUpdates); // Will cause a reconfig to hang after completing the config quorum check. MONGO_FAIL_POINT_DEFINE(omitConfigQuorumCheck); -// Will cause signal drain complete to hang after reconfig +// Will cause signal drain complete to hang before reconfig. +MONGO_FAIL_POINT_DEFINE(hangBeforeReconfigOnDrainComplete); +// Will cause signal drain complete to hang after reconfig. MONGO_FAIL_POINT_DEFINE(hangAfterReconfigOnDrainComplete); // Number of times we tried to go live as a secondary. @@ -1135,6 +1137,10 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx, lk.unlock(); if (needBumpConfigTerm) { + if (MONGO_unlikely(hangBeforeReconfigOnDrainComplete.shouldFail())) { + LOGV2(5726200, "Hanging due to hangBeforeReconfigOnDrainComplete failpoint"); + hangBeforeReconfigOnDrainComplete.pauseWhileSet(opCtx); + } // We re-write the term but keep version the same. This conceptually a no-op // in the config consensus group, analogous to writing a new oplog entry // in Raft log state machine on step up. diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp index d46bf2b3c9a..869de681227 100644 --- a/src/mongo/db/repl/topology_coordinator.cpp +++ b/src/mongo/db/repl/topology_coordinator.cpp @@ -3101,9 +3101,9 @@ void TopologyCoordinator::processReplSetRequestVotes(const ReplSetRequestVotesAr return; } - if (args.getConfigVersionAndTerm() != _rsConfig.getConfigVersionAndTerm()) { + if (args.getConfigVersionAndTerm() < _rsConfig.getConfigVersionAndTerm()) { response->setVoteGranted(false); - response->setReason("candidate's config with {} differs from mine with {}"_format( + response->setReason("candidate's config with {} is older than mine with {}"_format( args.getConfigVersionAndTerm(), _rsConfig.getConfigVersionAndTerm())); } else if (args.getTerm() < _term) { response->setVoteGranted(false); @@ -3125,8 +3125,15 @@ void TopologyCoordinator::processReplSetRequestVotes(const ReplSetRequestVotesAr _rsConfig.getMemberAt(_lastVote.getCandidateIndex()).getHostAndPort(), _lastVote.getTerm())); } else { + bool isSameConfig = args.getConfigVersionAndTerm() == _rsConfig.getConfigVersionAndTerm(); int betterPrimary = _findHealthyPrimaryOfEqualOrGreaterPriority(args.getCandidateIndex()); - if (_selfConfig().isArbiter() && betterPrimary >= 0) { + // Do not grant vote if we are arbiter and can see a healthy primary of greater or equal + // priority, to prevent primary flapping when there are two nodes that can't talk to each + // other but we that can talk to both as arbiter. We only do this if the voter's config + // is same as ours, otherwise the primary information might be stale and we might not be + // arbiter in the candidate's newer config. We might also hit an invariant described in + // SERVER-46387 without the check for same config. + if (isSameConfig && _selfConfig().isArbiter() && betterPrimary >= 0) { response->setVoteGranted(false); response->setReason( "can see a healthy primary ({}) of equal or greater priority"_format( diff --git a/src/mongo/db/repl/topology_coordinator_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_v1_test.cpp index 1be2b970576..476adf62f90 100644 --- a/src/mongo/db/repl/topology_coordinator_v1_test.cpp +++ b/src/mongo/db/repl/topology_coordinator_v1_test.cpp @@ -3491,7 +3491,7 @@ public: TEST_F(ConfigTermAndVersionVoteTest, DataNodeDoesNotGrantVoteWhenConfigVersionIsLower) { auto response = testWithArbiter(false, 1, 2); ASSERT_EQUALS( - "candidate's config with {version: 1, term: 2} differs from mine with" + "candidate's config with {version: 1, term: 2} is older than mine with" " {version: 2, term: 2}", response.getReason()); } @@ -3499,7 +3499,7 @@ TEST_F(ConfigTermAndVersionVoteTest, DataNodeDoesNotGrantVoteWhenConfigVersionIs TEST_F(ConfigTermAndVersionVoteTest, ArbiterDoesNotGrantVoteWhenConfigVersionIsLower) { auto response = testWithArbiter(true, 1, 2); ASSERT_EQUALS( - "candidate's config with {version: 1, term: 2} differs from mine with" + "candidate's config with {version: 1, term: 2} is older than mine with" " {version: 2, term: 2}", response.getReason()); } @@ -3507,7 +3507,7 @@ TEST_F(ConfigTermAndVersionVoteTest, ArbiterDoesNotGrantVoteWhenConfigVersionIsL TEST_F(ConfigTermAndVersionVoteTest, DataNodeDoesNotGrantVoteWhenConfigTermIsLower) { auto response = testWithArbiter(false, 2, 1); ASSERT_EQUALS( - "candidate's config with {version: 2, term: 1} differs from mine with" + "candidate's config with {version: 2, term: 1} is older than mine with" " {version: 2, term: 2}", response.getReason()); } @@ -3515,7 +3515,7 @@ TEST_F(ConfigTermAndVersionVoteTest, DataNodeDoesNotGrantVoteWhenConfigTermIsLow TEST_F(ConfigTermAndVersionVoteTest, ArbiterDoesNotGrantVoteWhenConfigTermIsLower) { auto response = testWithArbiter(true, 2, 1); ASSERT_EQUALS( - "candidate's config with {version: 2, term: 1} differs from mine with" + "candidate's config with {version: 2, term: 1} is older than mine with" " {version: 2, term: 2}", response.getReason()); } @@ -3683,7 +3683,7 @@ TEST_F(TopoCoordTest, NodeDoesNotGrantDryRunVoteWhenConfigVersionIsLower) { getTopoCoord().processReplSetRequestVotes(args, &response); ASSERT_EQUALS( - "candidate's config with {version: 0, term: 1} differs from mine with {version: 1, term: " + "candidate's config with {version: 0, term: 1} is older than mine with {version: 1, term: " "1}", response.getReason()); ASSERT_EQUALS(1, response.getTerm()); @@ -3738,7 +3738,7 @@ TEST_F(TopoCoordTest, NodeDoesNotGrantDryRunVoteWhenTermIsStale) { ASSERT_FALSE(response.getVoteGranted()); } -TEST_F(TopoCoordTest, NodeDoesNotGrantVoteWhenTermIsHigherButConfigVersionIsLower) { +TEST_F(TopoCoordTest, NodeGrantsVoteWhenTermIsHigherButConfigVersionIsLower) { updateConfig(BSON("_id" << "rs0" << "version" << 2 << "term" << 1LL << "members" @@ -3769,11 +3769,7 @@ TEST_F(TopoCoordTest, NodeDoesNotGrantVoteWhenTermIsHigherButConfigVersionIsLowe getTopoCoord().processReplSetRequestVotes(args, &response); // Candidates config(t, v) is (2, 1) and our config is (1, 2). Even though the candidate's // config version is lower, we grant our vote because the candidate's config term is higher. - ASSERT_FALSE(response.getVoteGranted()); - ASSERT_EQ( - "candidate's config with {version: 1, term: 2} differs from mine with {version: 2, term: " - "1}", - response.getReason()); + ASSERT_TRUE(response.getVoteGranted()); } TEST_F(TopoCoordTest, GrantDryRunVoteEvenWhenTermHasBeenSeen) { |